update

2024-07-22 16:01:27 +00:00
58 changed files with 316 additions and 453 deletions
@@ -19,11 +19,10 @@ jobs:
    strategy:
      fail-fast: false
      max-parallel: 1
-    runs-on: 
-      group: aws-g6-4xlarge-plus
+    runs-on: [single-gpu, nvidia-gpu, a10, ci]
    container:
      image: diffusers/diffusers-pytorch-compile-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -7,7 +7,7 @@ on:

 env:
  DIFFUSERS_IS_CI: yes
-  HF_HUB_ENABLE_HF_TRANSFER: 1
+  HF_HOME: /mnt/cache
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  PYTEST_TIMEOUT: 600
@@ -27,6 +27,10 @@ jobs:
        uses: actions/checkout@v3
        with:
          fetch-depth: 2
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
      - name: Install dependencies
        run: |
          pip install -e .
@@ -46,17 +50,16 @@ jobs:
          path: reports

  run_nightly_tests_for_torch_pipelines:
-    name: Nightly Torch Pipelines CUDA Tests
+    name: Torch Pipelines CUDA Nightly Tests
    needs: setup_torch_cuda_pipeline_matrix
    strategy:
      fail-fast: false
-      max-parallel: 8
      matrix:
        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -64,16 +67,19 @@ jobs:
          fetch-depth: 2
      - name: NVIDIA-SMI
        run: nvidia-smi
+
      - name: Install dependencies
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
          python -m uv pip install pytest-reportlog
+
      - name: Environment
        run: |
          python utils/print_env.py
-      - name: Pipeline CUDA Test
+
+      - name: Nightly PyTorch CUDA checkpoint (pipelines) tests
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
@@ -84,36 +90,38 @@ jobs:
            --make-reports=tests_pipeline_${{ matrix.module }}_cuda \
            --report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
            tests/pipelines/${{ matrix.module }}
+
      - name: Failure short reports
        if: ${{ failure() }}
        run: |
          cat reports/tests_pipeline_${{ matrix.module }}_cuda_stats.txt
          cat reports/tests_pipeline_${{ matrix.module }}_cuda_failures_short.txt
+
      - name: Test suite reports artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
        with:
          name: pipeline_${{ matrix.module }}_test_reports
          path: reports
+
      - name: Generate Report and Notify Channel
        if: always()
        run: |
          pip install slack_sdk tabulate
-          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_nightly_tests_for_other_torch_modules:
-    name: Nightly Torch CUDA Tests
+    name: Torch Non-Pipelines CUDA Nightly Tests
    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
    defaults:
      run:
        shell: bash
    strategy:
      matrix:
-        max-parallel: 2
-        module: [models, schedulers, lora, others, single_file, examples]
+        module: [models, schedulers, others, examples]
    steps:
    - name: Checkout diffusers
      uses: actions/checkout@v3
@@ -125,8 +133,8 @@ jobs:
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
-        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
        python -m uv pip install pytest-reportlog
+
    - name: Environment
      run: python utils/print_env.py

@@ -150,6 +158,7 @@ jobs:
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
+        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v --make-reports=examples_torch_cuda \
          --report-log=examples_torch_cuda.log \
@@ -172,7 +181,64 @@ jobs:
      if: always()
      run: |
        pip install slack_sdk tabulate
-        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+
+  run_lora_nightly_tests:  # nightly job: runs tests/lora with accelerate and peft installed from GitHub
+    name: Nightly LoRA Tests with PEFT and TORCH
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    container:  # host /mnt/hf_cache is bind-mounted at /mnt/cache/ inside the container
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --gpus 0
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies  # editable install of the repo with the quality and test extras
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
+        python -m uv pip install pytest-reportlog
+
+    - name: Environment
+      run: python utils/print_env.py
+
+    - name: Run nightly LoRA tests with PEFT and Torch  # tests matching Flax or Onnx are deselected via -k
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+        CUBLAS_WORKSPACE_CONFIG: :16:8
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx" \
+          --make-reports=tests_torch_lora_cuda \
+          --report-log=tests_torch_lora_cuda.log \
+          tests/lora
+
+    - name: Failure short reports  # runs only when an earlier step in this job failed
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_torch_lora_cuda_stats.txt
+        cat reports/tests_torch_lora_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts  # uploads the reports directory whether or not the tests passed
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: torch_lora_cuda_test_reports
+        path: reports
+
+    - name: Generate Report and Notify Channel  # stdout of log_reports.py is appended to the step summary
+      if: always()
+      run: |
+        pip install slack_sdk tabulate
+        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_flax_tpu_tests:
    name: Nightly Flax TPU Tests
@@ -228,14 +294,14 @@ jobs:
      if: always()
      run: |
        pip install slack_sdk tabulate
-        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_nightly_onnx_tests:
    name: Nightly ONNXRuntime CUDA tests on Ubuntu
    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-onnxruntime-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/

    steps:
    - name: Checkout diffusers
@@ -252,10 +318,11 @@ jobs:
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
        python -m uv pip install pytest-reportlog
+
    - name: Environment
      run: python utils/print_env.py

-    - name: Run Nightly ONNXRuntime CUDA tests
+    - name: Run nightly ONNXRuntime CUDA tests
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
@@ -282,7 +349,7 @@ jobs:
      if: always()
      run: |
        pip install slack_sdk tabulate
-        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+        python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY

  run_nightly_tests_apple_m1:
    name: Nightly PyTorch MPS tests on MacOS
@@ -344,4 +411,4 @@ jobs:
        if: always()
        run: |
          pip install slack_sdk tabulate
-          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
@@ -11,9 +11,11 @@ on:

 env:
  DIFFUSERS_IS_CI: yes
+  HF_HOME: /mnt/cache
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  PYTEST_TIMEOUT: 600
+  RUN_SLOW: yes
  PIPELINE_USAGE_CUTOFF: 50000

 jobs:
@@ -50,7 +52,7 @@ jobs:
          path: reports

  torch_pipelines_cuda_tests:
-    name: Torch Pipelines CUDA Tests
+    name: Torch Pipelines CUDA Slow Tests
    needs: setup_torch_cuda_pipeline_matrix
    strategy:
      fail-fast: false
@@ -60,7 +62,7 @@ jobs:
    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
    steps:
      - name: Checkout diffusers
        uses: actions/checkout@v3
@@ -104,7 +106,7 @@ jobs:
    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
    defaults:
      run:
        shell: bash
@@ -122,13 +124,12 @@ jobs:
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
-        python -m uv pip install peft@git+https://github.com/huggingface/peft.git

    - name: Environment
      run: |
        python utils/print_env.py

-    - name: Run PyTorch CUDA tests
+    - name: Run slow PyTorch CUDA tests
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
@@ -152,6 +153,62 @@ jobs:
        name: torch_cuda_test_reports
        path: reports

+  peft_cuda_tests:  # runs the LoRA tests in tests/lora/ and tests/models/ with peft installed from GitHub
+    name: PEFT CUDA Tests
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    container:  # host /mnt/cache/.cache/huggingface/diffusers is bind-mounted at /mnt/cache/
+      image: diffusers/diffusers-pytorch-cuda
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0
+    defaults:
+      run:
+        shell: bash
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies  # editable install of the repo with the quality and test extras
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+        python -m pip install -U peft@git+https://github.com/huggingface/peft.git
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run slow PEFT CUDA tests  # two pytest runs, each writing its own report set under reports/
+      env:
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
+        CUBLAS_WORKSPACE_CONFIG: :16:8
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "not Flax and not Onnx and not PEFTLoRALoading" \
+          --make-reports=tests_peft_cuda \
+          tests/lora/
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
+          -s -v -k "lora and not Flax and not Onnx and not PEFTLoRALoading" \
+          --make-reports=tests_peft_cuda_models_lora \
+          tests/models/
+
+    - name: Failure short reports  # prints stats and short failure logs for both pytest runs above
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_peft_cuda_stats.txt
+        cat reports/tests_peft_cuda_failures_short.txt
+        cat reports/tests_peft_cuda_models_lora_stats.txt
+        cat reports/tests_peft_cuda_models_lora_failures_short.txt
+
+    - name: Test suite reports artifacts  # uploads the reports directory whether or not the tests passed
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: torch_peft_test_reports
+        path: reports
+
  flax_tpu_tests:
    name: Flax TPU Tests
    runs-on: docker-tpu
@@ -253,7 +309,7 @@ jobs:

    container:
      image: diffusers/diffusers-pytorch-compile-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

    steps:
    - name: Checkout diffusers
@@ -295,7 +351,7 @@ jobs:

    container:
      image: diffusers/diffusers-pytorch-xformers-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

    steps:
    - name: Checkout diffusers
@@ -336,7 +392,7 @@ jobs:

    container:
      image: diffusers/diffusers-pytorch-cuda
-      options: --gpus 0 --shm-size "16gb" --ipc host
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/

    steps:
    - name: Checkout diffusers
@@ -38,7 +38,6 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
-        hf_transfer \
        Jinja2 \
        librosa \
        numpy==1.26.4 \
@@ -38,7 +38,6 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    datasets \
    hf-doc-builder \
    huggingface-hub \
-    hf_transfer \
    Jinja2 \
    librosa \
    numpy==1.26.4 \
@@ -38,7 +38,6 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    datasets \
    hf-doc-builder \
    huggingface-hub \
-    hf_transfer \
    Jinja2 \
    librosa \
    numpy==1.26.4 \
@@ -38,7 +38,6 @@ RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
        datasets \
        hf-doc-builder \
        huggingface-hub \
-        hf_transfer \
        Jinja2 \
        librosa \
        numpy==1.26.4 \
@@ -340,7 +340,6 @@ Now you can wrap all these components together in a training loop with 🤗 Acce
 ...                 loss = F.mse_loss(noise_pred, noise)
 ...                 accelerator.backward(loss)

-...             if (step + 1) % config.gradient_accumulation_steps == 0:
 ...                 accelerator.clip_grad_norm_(model.parameters(), 1.0)
 ...                 optimizer.step()
 ...                 lr_scheduler.step()
@@ -1302,7 +1302,7 @@ def main(args):
                text_encoder_lora_layers=text_encoder_one_lora_layers_to_save,
            )
        if args.train_text_encoder_ti:
-            embedding_handler.save_embeddings(f"{args.output_dir}/{Path(args.output_dir).name}_emb.safetensors")
+            embedding_handler.save_embeddings(f"{output_dir}/{args.output_dir}_emb.safetensors")

    def load_model_hook(models, input_dir):
        unet_ = None
@@ -1605,15 +1605,13 @@ def main(args):
                if isinstance(model, type(unwrap_model(unet))):
                    unet_lora_layers_to_save = convert_state_dict_to_diffusers(get_peft_model_state_dict(model))
                elif isinstance(model, type(unwrap_model(text_encoder_one))):
-                    if args.train_text_encoder:
-                        text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers(
-                            get_peft_model_state_dict(model)
-                        )
+                    text_encoder_one_lora_layers_to_save = convert_state_dict_to_diffusers(
+                        get_peft_model_state_dict(model)
+                    )
                elif isinstance(model, type(unwrap_model(text_encoder_two))):
-                    if args.train_text_encoder:
-                        text_encoder_two_lora_layers_to_save = convert_state_dict_to_diffusers(
-                            get_peft_model_state_dict(model)
-                        )
+                    text_encoder_two_lora_layers_to_save = convert_state_dict_to_diffusers(
+                        get_peft_model_state_dict(model)
+                    )
                else:
                    raise ValueError(f"unexpected save model: {model.__class__}")

@@ -1627,7 +1625,7 @@ def main(args):
                text_encoder_2_lora_layers=text_encoder_two_lora_layers_to_save,
            )
        if args.train_text_encoder_ti:
-            embedding_handler.save_embeddings(f"{args.output_dir}/{Path(args.output_dir).name}_emb.safetensors")
+            embedding_handler.save_embeddings(f"{output_dir}/{args.output_dir}_emb.safetensors")

    def load_model_hook(models, input_dir):
        unet_ = None
@@ -24,6 +24,7 @@ from ..utils import (
    is_bitsandbytes_available,
    is_flax_available,
    is_google_colab,
+    is_notebook,
    is_peft_available,
    is_safetensors_available,
    is_torch_available,
@@ -106,6 +107,8 @@ class EnvironmentCommand(BaseDiffusersCLICommand):

        platform_info = platform.platform()

+        is_notebook_str = "Yes" if is_notebook() else "No"
+
        is_google_colab_str = "Yes" if is_google_colab() else "No"

        accelerator = "NA"
@@ -120,7 +123,7 @@ class EnvironmentCommand(BaseDiffusersCLICommand):
                out_str = out_str.decode("utf-8")

                if len(out_str) > 0:
-                    accelerator = out_str.strip()
+                    accelerator = out_str.strip() + " VRAM"
            except FileNotFoundError:
                pass
        elif platform.system() == "Darwin":  # Mac OS
@@ -152,6 +155,7 @@ class EnvironmentCommand(BaseDiffusersCLICommand):
        info = {
            "🤗 Diffusers version": version,
            "Platform": platform_info,
+            "Running on a notebook?": is_notebook_str,
            "Running on Google Colab?": is_google_colab_str,
            "Python version": platform.python_version(),
            "PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})",
@@ -677,21 +677,6 @@ class Attention(nn.Module):
                concatenated_bias = torch.cat([self.to_k.bias.data, self.to_v.bias.data])
                self.to_kv.bias.copy_(concatenated_bias)

-        # handle added projections for SD3 and others.
-        if hasattr(self, "add_q_proj") and hasattr(self, "add_k_proj") and hasattr(self, "add_v_proj"):
-            concatenated_weights = torch.cat(
-                [self.add_q_proj.weight.data, self.add_k_proj.weight.data, self.add_v_proj.weight.data]
-            )
-            in_features = concatenated_weights.shape[1]
-            out_features = concatenated_weights.shape[0]
-
-            self.to_added_qkv = nn.Linear(in_features, out_features, bias=True, device=device, dtype=dtype)
-            self.to_added_qkv.weight.copy_(concatenated_weights)
-            concatenated_bias = torch.cat(
-                [self.add_q_proj.bias.data, self.add_k_proj.bias.data, self.add_v_proj.bias.data]
-            )
-            self.to_added_qkv.bias.copy_(concatenated_bias)
-
        self.fused_projections = fuse


@@ -1182,6 +1167,7 @@ class AuraFlowAttnProcessor2_0:
        attn: Attention,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: torch.FloatTensor = None,
+        i=0,
        *args,
        **kwargs,
    ) -> torch.FloatTensor:
@@ -1722,109 +1708,6 @@ class HunyuanAttnProcessor2_0:
        return hidden_states


-class FusedHunyuanAttnProcessor2_0:
-    r"""
-    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0) with fused
-    projection layers. This is used in the HunyuanDiT model. It applies a s normalization layer and rotary embedding on
-    query and key vector.
-    """
-
-    def __init__(self):
-        if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError(
-                "FusedHunyuanAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
-            )
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        temb: Optional[torch.Tensor] = None,
-        image_rotary_emb: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        from .embeddings import apply_rotary_emb
-
-        residual = hidden_states
-        if attn.spatial_norm is not None:
-            hidden_states = attn.spatial_norm(hidden_states, temb)
-
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-
-        if attention_mask is not None:
-            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-            # scaled_dot_product_attention expects attention_mask shape to be
-            # (batch, heads, source_length, target_length)
-            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
-
-        if attn.group_norm is not None:
-            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
-        if encoder_hidden_states is None:
-            qkv = attn.to_qkv(hidden_states)
-            split_size = qkv.shape[-1] // 3
-            query, key, value = torch.split(qkv, split_size, dim=-1)
-        else:
-            if attn.norm_cross:
-                encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-            query = attn.to_q(hidden_states)
-
-            kv = attn.to_kv(encoder_hidden_states)
-            split_size = kv.shape[-1] // 2
-            key, value = torch.split(kv, split_size, dim=-1)
-
-        inner_dim = key.shape[-1]
-        head_dim = inner_dim // attn.heads
-
-        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
-        if attn.norm_q is not None:
-            query = attn.norm_q(query)
-        if attn.norm_k is not None:
-            key = attn.norm_k(key)
-
-        # Apply RoPE if needed
-        if image_rotary_emb is not None:
-            query = apply_rotary_emb(query, image_rotary_emb)
-            if not attn.is_cross_attention:
-                key = apply_rotary_emb(key, image_rotary_emb)
-
-        # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        # TODO: add support for attn.scale when we move to Torch 2.1
-        hidden_states = F.scaled_dot_product_attention(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
-        )
-
-        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
-        hidden_states = hidden_states.to(query.dtype)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        return hidden_states
-
-
 class LuminaAttnProcessor2_0:
    r"""
    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
@@ -26,7 +26,6 @@ from ..attention_processor import (
    AttentionProcessor,
    AttnAddedKVProcessor,
    AttnProcessor,
-    FusedAttnProcessor2_0,
 )
 from ..modeling_outputs import AutoencoderKLOutput
 from ..modeling_utils import ModelMixin
@@ -63,9 +62,6 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
            If enabled it will force the VAE to run in float32 for high image resolution pipelines, such as SD-XL. VAE
            can be fine-tuned / trained to a lower range without loosing too much precision in which case
            `force_upcast` can be set to `False` - see: https://huggingface.co/madebyollin/sdxl-vae-fp16-fix
-        mid_block_add_attention (`bool`, *optional*, default to `True`):
-            If enabled, the mid_block of the Encoder and Decoder will have attention blocks. If set to false, the
-            mid_block will only have resnet blocks
    """

    _supports_gradient_checkpointing = True
@@ -91,7 +87,6 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
        force_upcast: float = True,
        use_quant_conv: bool = True,
        use_post_quant_conv: bool = True,
-        mid_block_add_attention: bool = True,
    ):
        super().__init__()

@@ -105,7 +100,6 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
            act_fn=act_fn,
            norm_num_groups=norm_num_groups,
            double_z=True,
-            mid_block_add_attention=mid_block_add_attention,
        )

        # pass init params to Decoder
@@ -117,7 +111,6 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
            layers_per_block=layers_per_block,
            norm_num_groups=norm_num_groups,
            act_fn=act_fn,
-            mid_block_add_attention=mid_block_add_attention,
        )

        self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) if use_quant_conv else None
@@ -493,8 +486,6 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

-        self.set_attn_processor(FusedAttnProcessor2_0())
-
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.
@@ -22,7 +22,7 @@ import torch.nn as nn
 from ..configuration_utils import ConfigMixin, register_to_config
 from ..loaders import FromOriginalModelMixin, PeftAdapterMixin
 from ..models.attention import JointTransformerBlock
-from ..models.attention_processor import Attention, AttentionProcessor, FusedJointAttnProcessor2_0
+from ..models.attention_processor import Attention, AttentionProcessor
 from ..models.modeling_outputs import Transformer2DModelOutput
 from ..models.modeling_utils import ModelMixin
 from ..utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
@@ -196,7 +196,7 @@ class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

-    # Copied from diffusers.models.transformers.transformer_sd3.SD3Transformer2DModel.fuse_qkv_projections
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
    def fuse_qkv_projections(self):
        """
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
@@ -220,8 +220,6 @@ class SD3ControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginal
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

-        self.set_attn_processor(FusedJointAttnProcessor2_0())
-
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.
@@ -29,7 +29,6 @@ from .attention_processor import (
    AttentionProcessor,
    AttnAddedKVProcessor,
    AttnProcessor,
-    FusedAttnProcessor2_0,
 )
 from .controlnet import ControlNetConditioningEmbedding
 from .embeddings import TimestepEmbedding, Timesteps
@@ -1002,8 +1001,6 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin):
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

-        self.set_attn_processor(FusedAttnProcessor2_0())
-
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.
@@ -20,7 +20,7 @@ from ...configuration_utils import ConfigMixin, register_to_config
 from ...utils import logging
 from ...utils.torch_utils import maybe_allow_in_graph
 from ..attention import FeedForward
-from ..attention_processor import Attention, AttentionProcessor, FusedHunyuanAttnProcessor2_0, HunyuanAttnProcessor2_0
+from ..attention_processor import Attention, AttentionProcessor, HunyuanAttnProcessor2_0
 from ..embeddings import (
    HunyuanCombinedTimestepTextSizeStyleEmbedding,
    PatchEmbed,
@@ -317,7 +317,7 @@ class HunyuanDiT2DModel(ModelMixin, ConfigMixin):
        self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)

-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedHunyuanAttnProcessor2_0
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
    def fuse_qkv_projections(self):
        """
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
@@ -341,8 +341,6 @@ class HunyuanDiT2DModel(ModelMixin, ConfigMixin):
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

-        self.set_attn_processor(FusedHunyuanAttnProcessor2_0())
-
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.
@@ -23,7 +23,7 @@ import torch.nn as nn
 from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
 from ...models.attention import JointTransformerBlock
-from ...models.attention_processor import Attention, AttentionProcessor, FusedJointAttnProcessor2_0
+from ...models.attention_processor import Attention, AttentionProcessor
 from ...models.modeling_utils import ModelMixin
 from ...models.normalization import AdaLayerNormContinuous
 from ...utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
@@ -211,7 +211,7 @@ class SD3Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOrigi
        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedJointAttnProcessor2_0
+    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections
    def fuse_qkv_projections(self):
        """
        Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
@@ -235,8 +235,6 @@ class SD3Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOrigi
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

-        self.set_attn_processor(FusedJointAttnProcessor2_0())
-
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.
@@ -30,7 +30,6 @@ from ..attention_processor import (
    AttentionProcessor,
    AttnAddedKVProcessor,
    AttnProcessor,
-    FusedAttnProcessor2_0,
 )
 from ..embeddings import (
    GaussianFourierProjection,
@@ -891,8 +890,6 @@ class UNet2DConditionModel(
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

-        self.set_attn_processor(FusedAttnProcessor2_0())
-
    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.

@@ -31,7 +31,6 @@ from ..attention_processor import (
    AttentionProcessor,
    AttnAddedKVProcessor,
    AttnProcessor,
-    FusedAttnProcessor2_0,
 )
 from ..embeddings import TimestepEmbedding, Timesteps
 from ..modeling_utils import ModelMixin
@@ -533,8 +532,6 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

-        self.set_attn_processor(FusedAttnProcessor2_0())
-
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.
@@ -29,7 +29,6 @@ from ..attention_processor import (
    AttentionProcessor,
    AttnAddedKVProcessor,
    AttnProcessor,
-    FusedAttnProcessor2_0,
 )
 from ..embeddings import TimestepEmbedding, Timesteps
 from ..modeling_utils import ModelMixin
@@ -499,8 +498,6 @@ class I2VGenXLUNet(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

-        self.set_attn_processor(FusedAttnProcessor2_0())
-
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.
@@ -29,7 +29,6 @@ from ..attention_processor import (
    AttnAddedKVProcessor,
    AttnProcessor,
    AttnProcessor2_0,
-    FusedAttnProcessor2_0,
    IPAdapterAttnProcessor,
    IPAdapterAttnProcessor2_0,
 )
@@ -930,8 +929,6 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
            if isinstance(module, Attention):
                module.fuse_projections(fuse=True)

-        self.set_attn_processor(FusedAttnProcessor2_0())
-
    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
    def unfuse_qkv_projections(self):
        """Disables the fused QKV projection if enabled.
@@ -286,7 +286,6 @@ class AudioLDM2Pipeline(DiffusionPipeline):
                The sequence of generated hidden-states.
        """
        max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens
-        model_kwargs = self.language_model._get_initial_cache_position(inputs_embeds, model_kwargs)
        for _ in range(max_new_tokens):
            # prepare model inputs
            model_inputs = prepare_inputs_for_generation(inputs_embeds, **model_kwargs)
@@ -260,6 +260,7 @@ class AuraFlowPipeline(DiffusionPipeline):
                padding="max_length",
                return_tensors="pt",
            )
+            text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
            text_input_ids = text_inputs["input_ids"]
            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids

@@ -272,7 +273,6 @@ class AuraFlowPipeline(DiffusionPipeline):
                    f" {max_length} tokens: {removed_text}"
                )

-            text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
            prompt_embeds = self.text_encoder(**text_inputs)[0]
            prompt_attention_mask = text_inputs["attention_mask"].unsqueeze(-1).expand(prompt_embeds.shape)
            prompt_embeds = prompt_embeds * prompt_attention_mask
@@ -73,6 +73,7 @@ from .import_utils import (
    is_librosa_available,
    is_matplotlib_available,
    is_note_seq_available,
+    is_notebook,
    is_onnx_available,
    is_peft_available,
    is_peft_version,
@@ -321,7 +321,18 @@ try:
 except importlib_metadata.PackageNotFoundError:
    _bitsandbytes_available = False

-_is_google_colab = "google.colab" in sys.modules or any(k.startswith("COLAB_") for k in os.environ)
+# Taken from `huggingface_hub`.
+_is_notebook = False
+try:
+    shell_class = get_ipython().__class__  # type: ignore # noqa: F821
+    for parent_class in shell_class.__mro__:  # e.g. "is subclass of"
+        if parent_class.__name__ == "ZMQInteractiveShell":
+            _is_notebook = True  # Jupyter notebook, Google colab or qtconsole
+            break
+except NameError:
+    pass  # Probably standard Python interpreter
+
+_is_google_colab = "google.colab" in sys.modules


 def is_torch_available():
@@ -432,6 +443,10 @@ def is_bitsandbytes_available():
    return _bitsandbytes_available


+def is_notebook():
+    return _is_notebook
+
+
 def is_google_colab():
    return _is_google_colab

@@ -124,9 +124,11 @@ class ModelUtilsTest(unittest.TestCase):
            if p1.data.ne(p2.data).sum() > 0:
                assert False, "Parameters not the same!"

-    @unittest.skip("Flaky behaviour on CI. Re-enable after migrating to new runners")
-    @unittest.skipIf(torch_device == "mps", reason="Test not supported for MPS.")
    def test_one_request_upon_cached(self):
+        # TODO: For some reason this test fails on MPS where no HEAD call is made.
+        if torch_device == "mps":
+            return
+
        use_safetensors = False

        with tempfile.TemporaryDirectory() as tmpdirname:
@@ -1,67 +0,0 @@
-# coding=utf-8
-# Copyright 2024 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import torch
-
-from diffusers.models.transformers import TransformerTemporalModel
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    torch_device,
-)
-
-from ..test_modeling_common import ModelTesterMixin
-
-
-enable_full_determinism()
-
-
-class TemporalTransformerTests(ModelTesterMixin, unittest.TestCase):
-    model_class = TransformerTemporalModel
-    main_input_name = "hidden_states"
-
-    @property
-    def dummy_input(self):
-        batch_size = 2
-        num_channels = 4
-        height = width = 32
-
-        hidden_states = torch.randn((batch_size, num_channels, height, width)).to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
-
-        return {
-            "hidden_states": hidden_states,
-            "timestep": timestep,
-        }
-
-    @property
-    def input_shape(self):
-        return (4, 32, 32)
-
-    @property
-    def output_shape(self):
-        return (4, 32, 32)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "num_attention_heads": 8,
-            "attention_head_dim": 4,
-            "in_channels": 4,
-            "num_layers": 1,
-            "norm_num_groups": 1,
-        }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
@@ -73,15 +73,14 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    def get_dummy_components(self):
        torch.manual_seed(0)
        unet = AudioLDM2UNet2DConditionModel(
-            block_out_channels=(8, 16),
-            layers_per_block=1,
-            norm_num_groups=8,
+            block_out_channels=(32, 64),
+            layers_per_block=2,
            sample_size=32,
            in_channels=4,
            out_channels=4,
            down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"),
            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=(8, 16),
+            cross_attention_dim=([None, 16, 32], [None, 16, 32]),
        )
        scheduler = DDIMScheduler(
            beta_start=0.00085,
@@ -92,10 +91,9 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        )
        torch.manual_seed(0)
        vae = AutoencoderKL(
-            block_out_channels=[8, 16],
+            block_out_channels=[32, 64],
            in_channels=1,
            out_channels=1,
-            norm_num_groups=8,
            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
            latent_channels=4,
@@ -104,34 +102,32 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        text_branch_config = ClapTextConfig(
            bos_token_id=0,
            eos_token_id=2,
-            hidden_size=8,
+            hidden_size=16,
            intermediate_size=37,
            layer_norm_eps=1e-05,
-            num_attention_heads=1,
-            num_hidden_layers=1,
+            num_attention_heads=2,
+            num_hidden_layers=2,
            pad_token_id=1,
            vocab_size=1000,
-            projection_dim=8,
+            projection_dim=16,
        )
        audio_branch_config = ClapAudioConfig(
-            spec_size=8,
+            spec_size=64,
            window_size=4,
-            num_mel_bins=8,
+            num_mel_bins=64,
            intermediate_size=37,
            layer_norm_eps=1e-05,
-            depths=[1, 1],
-            num_attention_heads=[1, 1],
-            num_hidden_layers=1,
+            depths=[2, 2],
+            num_attention_heads=[2, 2],
+            num_hidden_layers=2,
            hidden_size=192,
-            projection_dim=8,
+            projection_dim=16,
            patch_size=2,
            patch_stride=2,
            patch_embed_input_channels=4,
        )
        text_encoder_config = ClapConfig.from_text_audio_configs(
-            text_config=text_branch_config,
-            audio_config=audio_branch_config,
-            projection_dim=16,
+            text_config=text_branch_config, audio_config=audio_branch_config, projection_dim=16
        )
        text_encoder = ClapModel(text_encoder_config)
        tokenizer = RobertaTokenizer.from_pretrained("hf-internal-testing/tiny-random-roberta", model_max_length=77)
@@ -145,8 +141,8 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
            d_model=32,
            d_ff=37,
            d_kv=8,
-            num_heads=1,
-            num_layers=1,
+            num_heads=2,
+            num_layers=2,
        )
        text_encoder_2 = T5EncoderModel(text_encoder_2_config)
        tokenizer_2 = T5Tokenizer.from_pretrained("hf-internal-testing/tiny-random-T5Model", model_max_length=77)
@@ -154,8 +150,8 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        torch.manual_seed(0)
        language_model_config = GPT2Config(
            n_embd=16,
-            n_head=1,
-            n_layer=1,
+            n_head=2,
+            n_layer=2,
            vocab_size=1000,
            n_ctx=99,
            n_positions=99,
@@ -164,11 +160,7 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        language_model.config.max_new_tokens = 8

        torch.manual_seed(0)
-        projection_model = AudioLDM2ProjectionModel(
-            text_encoder_dim=16,
-            text_encoder_1_dim=32,
-            langauge_model_dim=16,
-        )
+        projection_model = AudioLDM2ProjectionModel(text_encoder_dim=16, text_encoder_1_dim=32, langauge_model_dim=16)

        vocoder_config = SpeechT5HifiGanConfig(
            model_in_dim=8,
@@ -228,18 +220,7 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):

        audio_slice = audio[:10]
        expected_slice = np.array(
-            [
-                2.602e-03,
-                1.729e-03,
-                1.863e-03,
-                -2.219e-03,
-                -2.656e-03,
-                -2.017e-03,
-                -2.648e-03,
-                -2.115e-03,
-                -2.502e-03,
-                -2.081e-03,
-            ]
+            [0.0025, 0.0018, 0.0018, -0.0023, -0.0026, -0.0020, -0.0026, -0.0021, -0.0027, -0.0020]
        )

        assert np.abs(audio_slice - expected_slice).max() < 1e-4
@@ -380,7 +361,7 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):

        audio_slice = audio[:10]
        expected_slice = np.array(
-            [0.0026, 0.0017, 0.0018, -0.0022, -0.0026, -0.002, -0.0026, -0.0021, -0.0025, -0.0021]
+            [0.0025, 0.0018, 0.0018, -0.0023, -0.0026, -0.0020, -0.0026, -0.0021, -0.0027, -0.0020]
        )

        assert np.abs(audio_slice - expected_slice).max() < 1e-4
@@ -407,7 +388,7 @@ class AudioLDM2PipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        assert audios.shape == (batch_size, 256)

        # test num_waveforms_per_prompt for single prompt
-        num_waveforms_per_prompt = 1
+        num_waveforms_per_prompt = 2
        audios = audioldm_pipe(prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt).audios

        assert audios.shape == (num_waveforms_per_prompt, 256)
@@ -37,12 +37,7 @@ from diffusers import (
    UNet2DConditionModel,
 )
 from diffusers.utils.import_utils import is_xformers_available
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    floats_tensor,
-    require_torch_gpu,
-    torch_device,
-)
+from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, require_torch_gpu, torch_device

 from ..pipeline_params import (
    IMAGE_TO_IMAGE_IMAGE_PARAMS,
@@ -233,6 +228,12 @@ class ControlNetPipelineSDXLFastTests(
    def test_attention_slicing_forward_pass(self):
        return self._test_attention_slicing_forward_pass(expected_max_diff=2e-3)

+    def test_dict_tuple_outputs_equivalent(self):
+        expected_slice = None
+        if torch_device == "cpu":
+            expected_slice = np.array([0.5490, 0.5053, 0.4676, 0.5816, 0.5364, 0.4830, 0.5937, 0.5719, 0.4318])
+        super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
+
    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
@@ -340,8 +341,7 @@ class ControlNetPipelineSDXLFastTests(

        output = sd_pipe(**inputs)
        image_slice = output.images[0, -3:, -3:, -1]
-
-        expected_slice = np.array([0.5460, 0.4943, 0.4635, 0.5832, 0.5366, 0.4815, 0.6034, 0.5741, 0.4341])
+        expected_slice = np.array([0.549, 0.5053, 0.4676, 0.5816, 0.5364, 0.483, 0.5937, 0.5719, 0.4318])

        # make sure that it's equal
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4
@@ -195,7 +195,7 @@ class StableDiffusionXLControlNetPipelineFastTests(
            expected_pipe_slice = None
            if torch_device == "cpu":
                expected_pipe_slice = np.array(
-                    [0.7335, 0.5866, 0.5623, 0.6242, 0.5751, 0.5999, 0.4091, 0.4590, 0.5054]
+                    [0.7331, 0.5907, 0.5667, 0.6029, 0.5679, 0.5968, 0.4033, 0.4761, 0.5090]
                )
        return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)

@@ -348,8 +348,9 @@ class StableDiffusionXLControlNetPipelineFastTests(

        output = sd_pipe(**inputs)
        image_slice = output.images[0, -3:, -3:, -1]
-
-        expected_slice = np.array([0.7335, 0.5866, 0.5623, 0.6242, 0.5751, 0.5999, 0.4091, 0.4590, 0.5054])
+        expected_slice = np.array(
+            [0.7330834, 0.590667, 0.5667336, 0.6029023, 0.5679491, 0.5968194, 0.4032986, 0.47612396, 0.5089609]
+        )

        # make sure that it's equal
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4
@@ -370,7 +371,7 @@ class StableDiffusionXLControlNetPipelineFastTests(
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.7820, 0.6195, 0.6193, 0.7045, 0.6706, 0.5837, 0.4147, 0.5232, 0.4868])
+        expected_slice = np.array([0.7799, 0.614, 0.6162, 0.7082, 0.6662, 0.5833, 0.4148, 0.5182, 0.4866])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

@@ -964,8 +965,9 @@ class StableDiffusionSSD1BControlNetPipelineFastTests(StableDiffusionXLControlNe

        output = sd_pipe(**inputs)
        image_slice = output.images[0, -3:, -3:, -1]
-
-        expected_slice = np.array([0.7212, 0.5890, 0.5491, 0.6425, 0.5970, 0.6091, 0.4418, 0.4556, 0.5032])
+        expected_slice = np.array(
+            [0.6831671, 0.5702532, 0.5459845, 0.6299793, 0.58563006, 0.6033695, 0.4493941, 0.46132287, 0.5035841]
+        )

        # make sure that it's equal
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-4
@@ -973,8 +975,7 @@ class StableDiffusionSSD1BControlNetPipelineFastTests(StableDiffusionXLControlNe
    def test_ip_adapter_single(self):
        expected_pipe_slice = None
        if torch_device == "cpu":
-            expected_pipe_slice = np.array([0.7212, 0.5890, 0.5491, 0.6425, 0.5970, 0.6091, 0.4418, 0.4556, 0.5032])
-
+            expected_pipe_slice = np.array([0.6832, 0.5703, 0.5460, 0.6300, 0.5856, 0.6034, 0.4494, 0.4613, 0.5036])
        return super().test_ip_adapter_single(from_ssd1b=True, expected_pipe_slice=expected_pipe_slice)

    def test_controlnet_sdxl_lcm(self):
@@ -993,7 +994,7 @@ class StableDiffusionSSD1BControlNetPipelineFastTests(StableDiffusionXLControlNe
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.6787, 0.5117, 0.5558, 0.6963, 0.6571, 0.5928, 0.4121, 0.5468, 0.5057])
+        expected_slice = np.array([0.6850, 0.5135, 0.5545, 0.7033, 0.6617, 0.5971, 0.4165, 0.5480, 0.5070])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

@@ -178,8 +178,7 @@ class ControlNetPipelineSDXLImg2ImgFastTests(
    def test_ip_adapter_single(self):
        expected_pipe_slice = None
        if torch_device == "cpu":
-            expected_pipe_slice = np.array([0.6276, 0.5271, 0.5205, 0.5393, 0.5774, 0.5872, 0.5456, 0.5415, 0.5354])
-        # TODO: update after slices.p
+            expected_pipe_slice = np.array([0.6265, 0.5441, 0.5384, 0.5446, 0.5810, 0.5908, 0.5414, 0.5428, 0.5353])
        return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)

    def test_stable_diffusion_xl_controlnet_img2img(self):
@@ -180,10 +180,11 @@ class StableDiffusion3ControlNetPipelineFastTests(unittest.TestCase, PipelineTes
        image = output.images

        image_slice = image[0, -3:, -3:, -1]
-
        assert image.shape == (1, 32, 32, 3)

-        expected_slice = np.array([0.5767, 0.7100, 0.5981, 0.5674, 0.5952, 0.4102, 0.5093, 0.5044, 0.6030])
+        expected_slice = np.array(
+            [0.5761719, 0.71777344, 0.59228516, 0.578125, 0.6020508, 0.39453125, 0.46728516, 0.51708984, 0.58984375]
+        )

        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -36,12 +36,7 @@ from diffusers.utils.testing_utils import (
 )

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import (
-    PipelineTesterMixin,
-    check_qkv_fusion_matches_attn_procs_length,
-    check_qkv_fusion_processors_exist,
-    to_np,
-)
+from ..test_pipelines_common import PipelineTesterMixin, to_np


 enable_full_determinism()
@@ -266,16 +261,6 @@ class HunyuanDiTPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        original_image_slice = image[0, -3:, -3:, -1]

        pipe.transformer.fuse_qkv_projections()
-        # TODO (sayakpaul): will refactor this once `fuse_qkv_projections()` has been added
-        # to the pipeline level.
-        pipe.transformer.fuse_qkv_projections()
-        assert check_qkv_fusion_processors_exist(
-            pipe.transformer
-        ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
-        assert check_qkv_fusion_matches_attn_procs_length(
-            pipe.transformer, pipe.transformer.original_attn_processors
-        ), "Something wrong with the attention processors concerning the fused QKV projections."
-
        inputs = self.get_dummy_inputs(device)
        inputs["return_dict"] = False
        image_fused = pipe(**inputs)[0]
@@ -39,6 +39,7 @@ from diffusers.utils.testing_utils import (
    enable_full_determinism,
    floats_tensor,
    numpy_cosine_similarity_distance,
+    print_tensor_test,
    require_torch_gpu,
    skip_mps,
    slow,
@@ -264,5 +265,6 @@ class I2VGenXLPipelineSlowTests(unittest.TestCase):
        assert image.shape == (num_frames, 704, 1280, 3)

        image_slice = image[0, -3:, -3:, -1]
+        print_tensor_test(image_slice.flatten())
        expected_slice = np.array([0.5482, 0.6244, 0.6274, 0.4584, 0.5935, 0.5937, 0.4579, 0.5767, 0.5892])
        assert numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice.flatten()) < 1e-3
@@ -94,7 +94,7 @@ class KandinskyPipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCase)

        assert image.shape == (1, 64, 64, 3)

-        expected_slice = np.array([0.2893, 0.1464, 0.4603, 0.3529, 0.4612, 0.7701, 0.4027, 0.3051, 0.5155])
+        expected_slice = np.array([0.0000, 0.0000, 0.6777, 0.1363, 0.3624, 0.7868, 0.3869, 0.3395, 0.5068])

        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -200,7 +200,7 @@ class KandinskyPipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest.Te

        assert image.shape == (1, 64, 64, 3)

-        expected_slice = np.array([0.4852, 0.4136, 0.4539, 0.4781, 0.4680, 0.5217, 0.4973, 0.4089, 0.4977])
+        expected_slice = np.array([0.4260, 0.3596, 0.4571, 0.3890, 0.4087, 0.5137, 0.4819, 0.4116, 0.5053])

        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -305,14 +305,11 @@ class KandinskyPipelineInpaintCombinedFastTests(PipelineTesterMixin, unittest.Te
        )[0]

        image_slice = image[0, -3:, -3:, -1]
-
        image_from_tuple_slice = image_from_tuple[0, -3:, -3:, -1]

-        print(image_from_tuple_slice)
-
        assert image.shape == (1, 64, 64, 3)

-        expected_slice = np.array([0.0320, 0.0860, 0.4013, 0.0518, 0.2484, 0.5847, 0.4411, 0.2321, 0.4593])
+        expected_slice = np.array([0.0477, 0.0808, 0.2972, 0.2705, 0.3620, 0.6247, 0.4464, 0.2870, 0.3530])

        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -211,13 +211,12 @@ class KandinskyPriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        )[0]

        image_slice = image[0, -10:]
-
        image_from_tuple_slice = image_from_tuple[0, -10:]

        assert image.shape == (1, 32)

        expected_slice = np.array(
-            [-0.5948, 0.1875, -0.1523, -1.1995, -1.4061, -0.6367, -1.4607, -0.6406, 0.8793, -0.3891]
+            [-0.0532, 1.7120, 0.3656, -1.0852, -0.8946, -1.1756, 0.4348, 0.2482, 0.5146, -0.1156]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -99,7 +99,7 @@ class KandinskyV22PipelineCombinedFastTests(PipelineTesterMixin, unittest.TestCa

        assert image.shape == (1, 64, 64, 3)

-        expected_slice = np.array([0.3076, 0.2729, 0.5668, 0.0522, 0.3384, 0.7028, 0.4908, 0.3659, 0.6243])
+        expected_slice = np.array([0.3013, 0.0471, 0.5176, 0.1817, 0.2566, 0.7076, 0.6712, 0.4421, 0.7503])

        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -221,7 +221,7 @@ class KandinskyV22PipelineImg2ImgCombinedFastTests(PipelineTesterMixin, unittest

        assert image.shape == (1, 64, 64, 3)

-        expected_slice = np.array([0.4445, 0.4287, 0.4596, 0.3919, 0.3730, 0.5039, 0.4834, 0.4269, 0.5521])
+        expected_slice = np.array([0.4353, 0.4710, 0.5128, 0.4806, 0.5054, 0.5348, 0.5224, 0.4603, 0.5025])

        assert (
            np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -213,13 +213,12 @@ class KandinskyV22PriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase)
        )[0]

        image_slice = image[0, -10:]
-
        image_from_tuple_slice = image_from_tuple[0, -10:]

        assert image.shape == (1, 32)

        expected_slice = np.array(
-            [-0.5948, 0.1875, -0.1523, -1.1995, -1.4061, -0.6367, -1.4607, -0.6406, 0.8793, -0.3891]
+            [-0.0532, 1.7120, 0.3656, -1.0852, -0.8946, -1.1756, 0.4348, 0.2482, 0.5146, -0.1156]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -30,12 +30,7 @@ from transformers import (
 )

 from diffusers import KandinskyV22PriorEmb2EmbPipeline, PriorTransformer, UnCLIPScheduler
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    floats_tensor,
-    skip_mps,
-    torch_device,
-)
+from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, skip_mps, torch_device

 from ..test_pipelines_common import PipelineTesterMixin

@@ -215,13 +210,23 @@ class KandinskyV22PriorEmb2EmbPipelineFastTests(PipelineTesterMixin, unittest.Te
        )[0]

        image_slice = image[0, -10:]
-
        image_from_tuple_slice = image_from_tuple[0, -10:]

        assert image.shape == (1, 32)

        expected_slice = np.array(
-            [-0.8947, 0.7225, -0.2400, -1.4224, -1.9268, -1.1454, -1.8220, -0.7972, 1.0465, -0.5207]
+            [
+                0.1071284,
+                1.3330271,
+                0.61260223,
+                -0.6691065,
+                -0.3846852,
+                -1.0303661,
+                0.22716111,
+                0.03348901,
+                0.30040675,
+                -0.24805029,
+            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2
@@ -28,7 +28,9 @@ from diffusers import (
    StableDiffusionXLControlNetPipeline,
    UNet2DConditionModel,
 )
-from diffusers.utils.testing_utils import enable_full_determinism
+from diffusers.utils.testing_utils import (
+    enable_full_determinism,
+)
 from diffusers.utils.torch_utils import randn_tensor

 from ..pipeline_params import (
@@ -235,7 +237,9 @@ class StableDiffusionXLControlNetPAGPipelineFastTests(
            64,
            3,
        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
-        expected_slice = np.array([0.7036, 0.5613, 0.5526, 0.6129, 0.5610, 0.5842, 0.4228, 0.4612, 0.5017])
+        expected_slice = np.array(
+            [0.6819614, 0.5551478, 0.5499094, 0.5769566, 0.53942275, 0.5707505, 0.41131154, 0.47833863, 0.49982738]
+        )

        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
        assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}"
@@ -259,7 +263,9 @@ class StableDiffusionXLControlNetPAGPipelineFastTests(
            64,
            3,
        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
-        expected_slice = np.array([0.6888, 0.5398, 0.5603, 0.6086, 0.5541, 0.5957, 0.4332, 0.4643, 0.5154])
+        expected_slice = np.array(
+            [0.66685176, 0.53207266, 0.5541569, 0.5912994, 0.5368312, 0.58433825, 0.42607725, 0.46805605, 0.5098659]
+        )

        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
        assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}"
@@ -283,7 +283,9 @@ class StableDiffusionXLPAGPipelineFastTests(
            64,
            3,
        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
-        expected_slice = np.array([0.5382, 0.5439, 0.4704, 0.4569, 0.5234, 0.4834, 0.5289, 0.5039, 0.4764])
+        expected_slice = np.array(
+            [0.55341685, 0.55503535, 0.47299808, 0.43274558, 0.4965323, 0.46310428, 0.51455414, 0.5015592, 0.46913484]
+        )

        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
        self.assertLessEqual(max_diff, 1e-3)
@@ -260,7 +260,9 @@ class StableDiffusionXLPAGImg2ImgPipelineFastTests(
            32,
            3,
        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
-        expected_slice = np.array([0.4613, 0.4902, 0.4406, 0.6788, 0.5611, 0.4529, 0.5893, 0.5975, 0.5226])
+        expected_slice = np.array(
+            [0.46703637, 0.4917526, 0.44394222, 0.6895079, 0.56251144, 0.45474228, 0.5957122, 0.6016377, 0.5276273]
+        )

        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
        assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}"
@@ -265,7 +265,9 @@ class StableDiffusionXLPAGInpaintPipelineFastTests(
            64,
            3,
        ), f"the shape of the output image should be (1, 64, 64, 3) but got {image.shape}"
-        expected_slice = np.array([0.8366, 0.5513, 0.6105, 0.6213, 0.6957, 0.7400, 0.6614, 0.6102, 0.5239])
+        expected_slice = np.array(
+            [0.8115454, 0.53986573, 0.5825281, 0.6028964, 0.67128646, 0.7046922, 0.6418713, 0.5933924, 0.5154763]
+        )

        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
        assert max_diff < 1e-3, f"output is different from expected, {image_slice.flatten()}"
@@ -181,7 +181,7 @@ class ShapEPipelineFastTests(PipelineTesterMixin, unittest.TestCase):

        assert image.shape == (32, 16)

-        expected_slice = np.array([-1.0000, -0.6559, 1.0000, -0.9096, -0.7252, 0.8211, -0.7647, -0.3308, 0.6462])
+        expected_slice = np.array([-1.0000, -0.6241, 1.0000, -0.8978, -0.6866, 0.7876, -0.7473, -0.2874, 0.6103])
        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

    def test_inference_batch_consistent(self):
@@ -168,12 +168,22 @@ class StableCascadePriorPipelineFastTests(PipelineTesterMixin, unittest.TestCase
        image_from_tuple = pipe(**self.get_dummy_inputs(device), return_dict=False)[0]

        image_slice = image[0, 0, 0, -10:]
-
        image_from_tuple_slice = image_from_tuple[0, 0, 0, -10:]
        assert image.shape == (1, 16, 24, 24)

        expected_slice = np.array(
-            [94.5498, -21.9481, -117.5025, -192.8760, 38.0117, 73.4709, 38.1142, -185.5593, -47.7869, 167.2853]
+            [
+                96.139565,
+                -20.213179,
+                -116.40341,
+                -191.57129,
+                39.350136,
+                74.80767,
+                39.782352,
+                -184.67352,
+                -46.426907,
+                168.41783,
+            ]
        )

        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-2
@@ -13,11 +13,7 @@ from diffusers.utils.testing_utils import (
    torch_device,
 )

-from ..test_pipelines_common import (
-    PipelineTesterMixin,
-    check_qkv_fusion_matches_attn_procs_length,
-    check_qkv_fusion_processors_exist,
-)
+from ..test_pipelines_common import PipelineTesterMixin


 class StableDiffusion3PipelineFastTests(unittest.TestCase, PipelineTesterMixin):
@@ -195,16 +191,7 @@ class StableDiffusion3PipelineFastTests(unittest.TestCase, PipelineTesterMixin):
        image = pipe(**inputs).images
        original_image_slice = image[0, -3:, -3:, -1]

-        # TODO (sayakpaul): will refactor this once `fuse_qkv_projections()` has been added
-        # to the pipeline level.
        pipe.transformer.fuse_qkv_projections()
-        assert check_qkv_fusion_processors_exist(
-            pipe.transformer
-        ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
-        assert check_qkv_fusion_matches_attn_procs_length(
-            pipe.transformer, pipe.transformer.original_attn_processors
-        ), "Something wrong with the attention processors concerning the fused QKV projections."
-
        inputs = self.get_dummy_inputs(device)
        image = pipe(**inputs).images
        image_slice_fused = image[0, -3:, -3:, -1]
@@ -133,7 +133,7 @@ class StableDiffusionImageVariationPipelineFastTests(
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5348, 0.5924, 0.4798, 0.5237, 0.5741, 0.4651, 0.5344, 0.4942, 0.4851])
+        expected_slice = np.array([0.5239, 0.5723, 0.4796, 0.5049, 0.5550, 0.4685, 0.5329, 0.4891, 0.4921])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

@@ -153,7 +153,7 @@ class StableDiffusionImageVariationPipelineFastTests(
        image_slice = image[-1, -3:, -3:, -1]

        assert image.shape == (2, 64, 64, 3)
-        expected_slice = np.array([0.6647, 0.5557, 0.5723, 0.5567, 0.5869, 0.6044, 0.5502, 0.5439, 0.5189])
+        expected_slice = np.array([0.6892, 0.5637, 0.5836, 0.5771, 0.6254, 0.6409, 0.5580, 0.5569, 0.5289])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

@@ -205,7 +205,7 @@ class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase):
        image_slice = image[0, -3:, -3:, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
-        expected_slice = np.array([0.5348, 0.5924, 0.4798, 0.5237, 0.5741, 0.4651, 0.5344, 0.4942, 0.4851])
+        expected_slice = np.array([0.8449, 0.9079, 0.7571, 0.7873, 0.8348, 0.7010, 0.6694, 0.6873, 0.6138])

        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
        assert max_diff < 1e-4
@@ -221,7 +221,7 @@ class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase):
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
-                expected_slice = np.array([0.5348, 0.5924, 0.4798, 0.5237, 0.5741, 0.4651, 0.5344, 0.4942, 0.4851])
+                expected_slice = np.array([-0.7974, -0.4343, -1.087, 0.04785, -1.327, 0.855, -2.148, -0.1725, 1.439])
                max_diff = numpy_cosine_similarity_distance(latents_slice.flatten(), expected_slice)

                assert max_diff < 1e-3
@@ -230,7 +230,7 @@ class StableDiffusionImageVariationPipelineSlowTests(unittest.TestCase):
                latents = latents.detach().cpu().numpy()
                assert latents.shape == (1, 4, 64, 64)
                latents_slice = latents[0, -3:, -3:, -1]
-                expected_slice = np.array([0.5348, 0.5924, 0.4798, 0.5237, 0.5741, 0.4651, 0.5344, 0.4942, 0.4851])
+                expected_slice = np.array([0.3232, 0.004883, 0.913, -1.084, 0.6143, -1.6875, -2.463, -0.439, -0.419])
                max_diff = numpy_cosine_similarity_distance(latents_slice.flatten(), expected_slice)

                assert max_diff < 1e-3
@@ -174,7 +174,7 @@ class StableDiffusionXLPipelineFastTests(
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5388, 0.5452, 0.4694, 0.4583, 0.5253, 0.4832, 0.5288, 0.5035, 0.47])
+        expected_slice = np.array([0.5552, 0.5569, 0.4725, 0.4348, 0.4994, 0.4632, 0.5142, 0.5012, 0.47])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

@@ -333,8 +333,7 @@ class StableDiffusionXLPipelineFastTests(
    def test_ip_adapter_single(self):
        expected_pipe_slice = None
        if torch_device == "cpu":
-            expected_pipe_slice = np.array([0.5388, 0.5452, 0.4694, 0.4583, 0.5253, 0.4832, 0.5288, 0.5035, 0.4766])
-
+            expected_pipe_slice = np.array([0.5552, 0.5569, 0.4725, 0.4348, 0.4994, 0.4632, 0.5142, 0.5012, 0.4700])
        return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)

    def test_attention_slicing_forward_pass(self):
@@ -295,9 +295,8 @@ class StableDiffusionXLAdapterPipelineFastTests(
            expected_pipe_slice = None
            if torch_device == "cpu":
                expected_pipe_slice = np.array(
-                    [0.5752, 0.6155, 0.4826, 0.5111, 0.5741, 0.4678, 0.5199, 0.5231, 0.4794]
+                    [0.5753, 0.6022, 0.4728, 0.4986, 0.5708, 0.4645, 0.5194, 0.5134, 0.4730]
                )
-
        return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)

    def test_stable_diffusion_adapter_default_case(self):
@@ -312,7 +311,9 @@ class StableDiffusionXLAdapterPipelineFastTests(
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([00.5752, 0.6155, 0.4826, 0.5111, 0.5741, 0.4678, 0.5199, 0.5231, 0.4794])
+        expected_slice = np.array(
+            [0.5752919, 0.6022097, 0.4728038, 0.49861962, 0.57084894, 0.4644975, 0.5193715, 0.5133664, 0.4729858]
+        )
        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3

    @parameterized.expand(
@@ -445,14 +446,15 @@ class StableDiffusionXLMultiAdapterPipelineFastTests(
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 64, 64, 3)
-        expected_slice = np.array([0.5617, 0.6081, 0.4807, 0.5071, 0.5665, 0.4614, 0.5165, 0.5164, 0.4786])
+        expected_slice = np.array(
+            [0.5813032, 0.60995954, 0.47563356, 0.5056669, 0.57199144, 0.4631841, 0.5176794, 0.51252556, 0.47183886]
+        )
        assert np.abs(image_slice.flatten() - expected_slice).max() < 5e-3

    def test_ip_adapter_single(self):
        expected_pipe_slice = None
        if torch_device == "cpu":
-            expected_pipe_slice = np.array([0.5617, 0.6081, 0.4807, 0.5071, 0.5665, 0.4614, 0.5165, 0.5164, 0.4786])
-
+            expected_pipe_slice = np.array([0.5813, 0.6100, 0.4756, 0.5057, 0.5720, 0.4632, 0.5177, 0.5125, 0.4718])
        return super().test_ip_adapter_single(from_multi=True, expected_pipe_slice=expected_pipe_slice)

    def test_inference_batch_consistent(
@@ -313,8 +313,7 @@ class StableDiffusionXLImg2ImgPipelineFastTests(
    def test_ip_adapter_single(self):
        expected_pipe_slice = None
        if torch_device == "cpu":
-            expected_pipe_slice = np.array([0.5133, 0.4626, 0.4970, 0.6273, 0.5160, 0.6891, 0.6639, 0.5892, 0.5709])
-
+            expected_pipe_slice = np.array([0.5174, 0.4512, 0.5006, 0.6273, 0.5160, 0.6825, 0.6655, 0.5840, 0.5675])
        return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)

    def test_stable_diffusion_xl_img2img_tiny_autoencoder(self):
@@ -226,8 +226,7 @@ class StableDiffusionXLInpaintPipelineFastTests(
    def test_ip_adapter_single(self):
        expected_pipe_slice = None
        if torch_device == "cpu":
-            expected_pipe_slice = np.array([0.8274, 0.5538, 0.6141, 0.5843, 0.6865, 0.7082, 0.5861, 0.6123, 0.5344])
-
+            expected_pipe_slice = np.array([0.7971, 0.5371, 0.5973, 0.5642, 0.6689, 0.6894, 0.5770, 0.6063, 0.5261])
        return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)

    def test_components_function(self):
@@ -251,7 +250,7 @@ class StableDiffusionXLInpaintPipelineFastTests(

        assert image.shape == (1, 64, 64, 3)

-        expected_slice = np.array([0.8279, 0.5673, 0.6088, 0.6156, 0.6923, 0.7347, 0.6547, 0.6108, 0.5198])
+        expected_slice = np.array([0.8029, 0.5523, 0.5825, 0.6003, 0.6702, 0.7018, 0.6369, 0.5955, 0.5123])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

@@ -386,7 +385,7 @@ class StableDiffusionXLInpaintPipelineFastTests(

        assert image.shape == (1, 64, 64, 3)

-        expected_slice = np.array([0.7540, 0.5231, 0.5833, 0.6217, 0.6339, 0.7067, 0.6507, 0.5672, 0.5030])
+        expected_slice = np.array([0.7045, 0.4838, 0.5454, 0.6270, 0.6168, 0.6717, 0.6484, 0.5681, 0.4922])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2

@@ -182,7 +182,7 @@ class StableUnCLIPImg2ImgPipelineFastTests(
        image_slice = image[0, -3:, -3:, -1]

        assert image.shape == (1, 32, 32, 3)
-        expected_slice = np.array([0.4397, 0.7080, 0.5590, 0.4255, 0.7181, 0.5938, 0.4051, 0.3720, 0.5116])
+        expected_slice = np.array([0.3872, 0.7224, 0.5601, 0.4741, 0.6872, 0.5814, 0.4636, 0.3867, 0.5078])

        assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3

@@ -146,7 +146,6 @@ class CustomPipeline(DiffusionPipeline):


 class DownloadTests(unittest.TestCase):
-    @unittest.skip("Flaky behaviour on CI. Re-enable after migrating to new runners")
    def test_one_request_upon_cached(self):
        # TODO: For some reason this test fails on MPS where no HEAD call is made.
        if torch_device == "mps":
@@ -192,7 +191,6 @@ class DownloadTests(unittest.TestCase):
            assert "scheduler" in os.listdir(cached_folder)
            assert "feature_extractor" in os.listdir(cached_folder)

-    @unittest.skip("Flaky behaviour on CI. Re-enable after migrating to new runners")
    def test_less_downloads_passed_object_calls(self):
        # TODO: For some reason this test fails on MPS where no HEAD call is made.
        if torch_device == "mps":
@@ -13,7 +13,6 @@ from typing import Any, Callable, Dict, Union
 import numpy as np
 import PIL.Image
 import torch
-import torch.nn as nn
 from huggingface_hub import ModelCard, delete_repo
 from huggingface_hub.utils import is_jinja_available
 from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
@@ -41,12 +40,7 @@ from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import logging
 from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available
-from diffusers.utils.testing_utils import (
-    CaptureLogger,
-    require_torch,
-    skip_mps,
-    torch_device,
-)
+from diffusers.utils.testing_utils import CaptureLogger, require_torch, skip_mps, torch_device

 from ..models.autoencoders.test_models_vae import (
    get_asym_autoencoder_kl_config,
@@ -73,17 +67,6 @@ def check_same_shape(tensor_list):
    return all(shape == shapes[0] for shape in shapes[1:])


-def check_qkv_fusion_matches_attn_procs_length(model, original_attn_processors):
-    current_attn_processors = model.attn_processors
-    return len(current_attn_processors) == len(original_attn_processors)
-
-
-def check_qkv_fusion_processors_exist(model):
-    current_attn_processors = model.attn_processors
-    proc_names = [v.__class__.__name__ for _, v in current_attn_processors.items()]
-    return all(p.startswith("Fused") for p in proc_names)
-
-
 class SDFunctionTesterMixin:
    """
    This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes.
@@ -213,19 +196,6 @@ class SDFunctionTesterMixin:
        original_image_slice = image[0, -3:, -3:, -1]

        pipe.fuse_qkv_projections()
-        for _, component in pipe.components.items():
-            if (
-                isinstance(component, nn.Module)
-                and hasattr(component, "original_attn_processors")
-                and component.original_attn_processors is not None
-            ):
-                assert check_qkv_fusion_processors_exist(
-                    component
-                ), "Something wrong with the fused attention processors. Expected all the attention processors to be fused."
-                assert check_qkv_fusion_matches_attn_procs_length(
-                    component, component.original_attn_processors
-                ), "Something wrong with the attention processors concerning the fused QKV projections."
-
        inputs = self.get_dummy_inputs(device)
        inputs["return_dict"] = False
        image_fused = pipe(**inputs)[0]
@@ -168,12 +168,8 @@ class TextToVideoZeroSDXLPipelineFastTests(PipelineTesterMixin, PipelineFromPipe
        first_frame_slice = result[0, -3:, -3:, -1]
        last_frame_slice = result[-1, -3:, -3:, 0]

-        expected_slice1 = np.array(
-            [0.6008109, 0.73051643, 0.51778656, 0.55817354, 0.45222935, 0.45998418, 0.57017255, 0.54874814, 0.47078788]
-        )
-        expected_slice2 = np.array(
-            [0.6011751, 0.47420046, 0.41660714, 0.6472957, 0.41261768, 0.5438129, 0.7401535, 0.6756011, 0.53652245]
-        )
+        expected_slice1 = np.array([0.48, 0.58, 0.53, 0.59, 0.50, 0.44, 0.60, 0.65, 0.52])
+        expected_slice2 = np.array([0.66, 0.49, 0.40, 0.70, 0.47, 0.51, 0.73, 0.65, 0.52])

        assert np.abs(first_frame_slice.flatten() - expected_slice1).max() < 1e-2
        assert np.abs(last_frame_slice.flatten() - expected_slice2).max() < 1e-2
@@ -76,7 +76,7 @@ def main(correct, fail=None):

    done_tests = defaultdict(int)
    for line in correct_lines:
-        file, class_name, test_name, correct_line = line.split("::")
+        file, class_name, test_name, correct_line = line.split(";")
        if test_failures is None or "::".join([file, class_name, test_name]) in test_failures:
            overwrite_file(file, class_name, test_name, correct_line, done_tests)