update

2024-07-26 07:30:35 +00:00 · 2024-07-26 07:29:56 +00:00 · 2024-07-26 03:48:37 +00:00
83 changed files with 189 additions and 8948 deletions
@@ -20,8 +20,7 @@ env:

 jobs:
  test-build-docker-images:
-    runs-on:
-      group: aws-general-8-plus-cache
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    if: github.event_name == 'pull_request'
    steps:
      - name: Set up Docker Buildx
@@ -51,8 +50,7 @@ jobs:
        if: steps.file_changes.outputs.all != ''

  build-and-push-docker-images:
-    runs-on:
-      group: aws-general-8-plus-cache
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    if: github.event_name != 'pull_request'

    permissions:
@@ -100,4 +98,4 @@ jobs:
          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
          title: "🤗 Results of the ${{ matrix.image-name }} Docker Image build"
          status: ${{ job.status }}
-          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
@@ -19,8 +19,7 @@ env:
 jobs:
  setup_torch_cuda_pipeline_matrix:
    name: Setup Torch Pipelines CUDA Slow Tests Matrix
-    runs-on:
-      group: aws-general-8-plus-cache
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    container:
      image: diffusers/diffusers-pytorch-cpu
    outputs:
@@ -56,8 +55,7 @@ jobs:
      max-parallel: 8
      matrix:
        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
@@ -107,8 +105,7 @@ jobs:

  run_nightly_tests_for_other_torch_modules:
    name: Nightly Torch CUDA Tests
-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
@@ -116,7 +113,6 @@ jobs:
      run:
        shell: bash
    strategy:
-      max-parallel: 2
      matrix:
        module: [models, schedulers, lora, others, single_file, examples]
    steps:
@@ -237,8 +233,7 @@ jobs:

  run_nightly_onnx_tests:
    name: Nightly ONNXRuntime CUDA tests on Ubuntu
-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-onnxruntime-cuda
      options: --gpus 0 --shm-size "16gb" --ipc host
@@ -15,8 +15,7 @@ concurrency:
 jobs:
  setup_pr_tests:
    name: Setup PR Tests
-    runs-on:
-      group: aws-general-8-plus-cache
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    container:
      image: diffusers/diffusers-pytorch-cpu
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -74,8 +73,7 @@ jobs:
      max-parallel: 2
      matrix:
        modules: ${{ fromJson(needs.setup_pr_tests.outputs.matrix) }}
-    runs-on:
-      group: aws-general-8-plus-cache
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    container:
      image: diffusers/diffusers-pytorch-cpu
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -125,13 +123,12 @@ jobs:
        config:
          - name: Hub tests for models, schedulers, and pipelines
            framework: hub_tests_pytorch
-            runner: aws-general-8-plus-cache
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_hub

    name: ${{ matrix.config.name }}
-    runs-on:
-      group: ${{ matrix.config.runner }}
+    runs-on: ${{ matrix.config.runner }}
    container:
      image: ${{ matrix.config.image }}
      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
@@ -71,8 +71,7 @@ jobs:

    name: LoRA - ${{ matrix.lib-versions }}

-    runs-on:
-      group: aws-general-8-plus-cache
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]

    container:
      image: diffusers/diffusers-pytorch-cpu
@@ -129,4 +128,4 @@ jobs:
      uses: actions/upload-artifact@v2
      with:
        name: pr_${{ matrix.config.report }}_test_reports
-        path: reports
+        path: reports
@@ -77,29 +77,28 @@ jobs:
        config:
          - name: Fast PyTorch Pipeline CPU tests
            framework: pytorch_pipelines
-            runner: aws-highmemory-32-plus
+            runner: [ self-hosted, intel-cpu, 32-cpu, 256-ram, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu_pipelines
          - name: Fast PyTorch Models & Schedulers CPU tests
            framework: pytorch_models
-            runner: aws-general-8-plus-cache
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu_models_schedulers
          - name: Fast Flax CPU tests
            framework: flax
-            runner: aws-general-8-plus-cache
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-flax-cpu
            report: flax_cpu
          - name: PyTorch Example CPU tests
            framework: pytorch_examples
-            runner: aws-general-8-plus-cache
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_example_cpu

    name: ${{ matrix.config.name }}

-    runs-on:
-      group: ${{ matrix.config.runner }}
+    runs-on: ${{ matrix.config.runner }}

    container:
      image: ${{ matrix.config.image }}
@@ -181,8 +180,7 @@ jobs:
        config:
          - name: Hub tests for models, schedulers, and pipelines
            framework: hub_tests_pytorch
-            runner:
-              group: aws-general-8-plus-cache
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_hub

@@ -19,8 +19,7 @@ env:
 jobs:
  setup_torch_cuda_pipeline_matrix:
    name: Setup Torch Pipelines CUDA Slow Tests Matrix
-    runs-on:
-      group: aws-general-8-plus-cache
+    runs-on: [ self-hosted, intel-cpu, 8-cpu, ci ]
    container:
      image: diffusers/diffusers-pytorch-cpu
    outputs:
@@ -58,8 +57,7 @@ jobs:
      max-parallel: 8
      matrix:
        module: ${{ fromJson(needs.setup_torch_cuda_pipeline_matrix.outputs.pipeline_test_matrix) }}
-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
@@ -103,8 +101,7 @@ jobs:

  torch_cuda_tests:
    name: Torch CUDA Tests
-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-pytorch-cuda
      options: --shm-size "16gb" --ipc host --gpus 0
@@ -204,8 +201,7 @@ jobs:

  onnx_cuda_tests:
    name: ONNX CUDA Tests
-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: diffusers/diffusers-onnxruntime-cuda
      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --gpus 0
@@ -253,8 +249,7 @@ jobs:
  run_torch_compile_tests:
    name: PyTorch Compile CUDA tests

-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]

    container:
      image: diffusers/diffusers-pytorch-compile-cuda
@@ -296,8 +291,7 @@ jobs:
  run_xformers_tests:
    name: PyTorch xformers CUDA tests

-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]

    container:
      image: diffusers/diffusers-pytorch-xformers-cuda
@@ -338,8 +332,7 @@ jobs:
  run_examples_tests:
    name: Examples PyTorch CUDA tests on Ubuntu

-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]

    container:
      image: diffusers/diffusers-pytorch-cuda
@@ -29,29 +29,28 @@ jobs:
        config:
          - name: Fast PyTorch CPU tests on Ubuntu
            framework: pytorch
-            runner: aws-general-8-plus-cache
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_cpu
          - name: Fast Flax CPU tests on Ubuntu
            framework: flax
-            runner: aws-general-8-plus-cache
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-flax-cpu
            report: flax_cpu
          - name: Fast ONNXRuntime CPU tests on Ubuntu
            framework: onnxruntime
-            runner: aws-general-8-plus-cache
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-onnxruntime-cpu
            report: onnx_cpu
          - name: PyTorch Example CPU tests on Ubuntu
            framework: pytorch_examples
-            runner: aws-general-8-plus-cache
+            runner: [ self-hosted, intel-cpu, 8-cpu, ci ]
            image: diffusers/diffusers-pytorch-cpu
            report: torch_example_cpu

    name: ${{ matrix.config.name }}

-    runs-on:
-      group: ${{ matrix.config.runner }}
+    runs-on: ${{ matrix.config.runner }}

    container:
      image: ${{ matrix.config.image }}
@@ -26,8 +26,7 @@ env:
 jobs:
  run_tests:
    name: "Run a test on our runner from a PR"
-    runs-on:
-      group: aws-g4dn-2xlarge
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
    container:
      image: ${{ github.event.inputs.docker_image }}
      options: --gpus 0 --privileged --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@@ -71,4 +70,4 @@ jobs:
        env:
            PY_TEST: ${{ github.event.inputs.test }}
        run: |
-          pytest "$PY_TEST"
+          pytest "$PY_TEST"
@@ -19,8 +19,7 @@ env:
 jobs:
  ssh_runner:
    name: "SSH"
-    runs-on:
-      group: aws-highmemory-32-plus
+    runs-on: [self-hosted, intel-cpu, 32-cpu, 256-ram, ci]
    container:
      image: ${{ github.event.inputs.docker_image }}
      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --privileged
@@ -22,8 +22,7 @@ env:
 jobs:
  ssh_runner:
    name: "SSH"
-    runs-on:
-      group: "${{ github.event.inputs.runner_type }}"
+    runs-on: [single-gpu, nvidia-gpu, "${{ github.event.inputs.runner_type }}", ci]
    container:
      image: ${{ github.event.inputs.docker_image }}
      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface/diffusers:/mnt/cache/ --gpus 0 --privileged
@@ -239,8 +239,6 @@
      title: AsymmetricAutoencoderKL
    - local: api/models/autoencoder_tiny
      title: Tiny AutoEncoder
-    - local: api/models/autoencoder_oobleck
-      title: Oobleck AutoEncoder
    - local: api/models/consistency_decoder_vae
      title: ConsistencyDecoderVAE
    - local: api/models/transformer2d
@@ -261,8 +259,6 @@
      title: TransformerTemporalModel
    - local: api/models/sd3_transformer2d
      title: SD3Transformer2DModel
-    - local: api/models/stable_audio_transformer
-      title: StableAudioDiTModel
    - local: api/models/prior_transformer
      title: PriorTransformer
    - local: api/models/controlnet
@@ -271,8 +267,6 @@
      title: HunyuanDiT2DControlNetModel
    - local: api/models/controlnet_sd3
      title: SD3ControlNetModel
-    - local: api/models/controlnet_sparsectrl
-      title: SparseControlNetModel
    title: Models
  - isExpanded: false
    sections:
@@ -366,8 +360,6 @@
      title: Semantic Guidance
    - local: api/pipelines/shap_e
      title: Shap-E
-    - local: api/pipelines/stable_audio
-      title: Stable Audio
    - local: api/pipelines/stable_cascade
      title: Stable Cascade
    - sections:
@@ -431,8 +423,6 @@
      title: CMStochasticIterativeScheduler
    - local: api/schedulers/consistency_decoder
      title: ConsistencyDecoderScheduler
-    - local: api/schedulers/cosine_dpm
-      title: CosineDPMSolverMultistepScheduler
    - local: api/schedulers/ddim_inverse
      title: DDIMInverseScheduler
    - local: api/schedulers/ddim
@@ -1,38 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# AutoencoderOobleck
-
-The Oobleck variational autoencoder (VAE) model with KL loss was introduced in [Stability-AI/stable-audio-tools](https://github.com/Stability-AI/stable-audio-tools) and [Stable Audio Open](https://huggingface.co/papers/2407.14358) by Stability AI. The model is used in 🤗 Diffusers to encode audio waveforms into latents and to decode latent representations into audio waveforms.
-
-The abstract from the paper is:
-
-*Open generative models are vitally important for the community, allowing for fine-tunes and serving as baselines when presenting new models. However, most current text-to-audio models are private and not accessible for artists and researchers to build upon. Here we describe the architecture and training process of a new open-weights text-to-audio model trained with Creative Commons data. Our evaluation shows that the model's performance is competitive with the state-of-the-art across various metrics. Notably, the reported FDopenl3 results (measuring the realism of the generations) showcase its potential for high-quality stereo sound synthesis at 44.1kHz.*
-
-## AutoencoderOobleck
-
-[[autodoc]] AutoencoderOobleck
-    - decode
-    - encode
-    - all
-
-## OobleckDecoderOutput
-
-[[autodoc]] models.autoencoders.autoencoder_oobleck.OobleckDecoderOutput
-
-## OobleckDecoderOutput
-
-[[autodoc]] models.autoencoders.autoencoder_oobleck.OobleckDecoderOutput
-
-## AutoencoderOobleckOutput
-
-[[autodoc]] models.autoencoders.autoencoder_oobleck.AutoencoderOobleckOutput
@@ -1,46 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# SparseControlNetModel
-
-SparseControlNetModel is an implementation of ControlNet for [AnimateDiff](https://arxiv.org/abs/2307.04725).
-
-ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala.
-
-The SparseCtrl version of ControlNet was introduced in [SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models](https://arxiv.org/abs/2311.16933) for achieving controlled generation in text-to-video diffusion models by Yuwei Guo, Ceyuan Yang, Anyi Rao, Maneesh Agrawala, Dahua Lin, and Bo Dai.
-
-The abstract from the paper is:
-
-*The development of text-to-video (T2V), i.e., generating videos with a given text prompt, has been significantly advanced in recent years. However, relying solely on text prompts often results in ambiguous frame composition due to spatial uncertainty. The research community thus leverages the dense structure signals, e.g., per-frame depth/edge sequences, to enhance controllability, whose collection accordingly increases the burden of inference. In this work, we present SparseCtrl to enable flexible structure control with temporally sparse signals, requiring only one or a few inputs, as shown in Figure 1. It incorporates an additional condition encoder to process these sparse signals while leaving the pre-trained T2V model untouched. The proposed approach is compatible with various modalities, including sketches, depth maps, and RGB images, providing more practical control for video generation and promoting applications such as storyboarding, depth rendering, keyframe animation, and interpolation. Extensive experiments demonstrate the generalization of SparseCtrl on both original and personalized T2V generators. Codes and models will be publicly available at [this https URL](https://guoyww.github.io/projects/SparseCtrl).*
-
-## Example for loading SparseControlNetModel
-
-```python
-import torch
-from diffusers import SparseControlNetModel
-
-# fp32 variant in float16
-# 1. Scribble checkpoint
-controlnet = SparseControlNetModel.from_pretrained("guoyww/animatediff-sparsectrl-scribble", torch_dtype=torch.float16)
-
-# 2. RGB checkpoint
-controlnet = SparseControlNetModel.from_pretrained("guoyww/animatediff-sparsectrl-rgb", torch_dtype=torch.float16)
-
-# For loading fp16 variant, pass `variant="fp16"` as an additional parameter
-```
-
-## SparseControlNetModel
-
-[[autodoc]] SparseControlNetModel
-
-## SparseControlNetOutput
-
-[[autodoc]] models.controlnet_sparsectrl.SparseControlNetOutput
@@ -1,19 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# StableAudioDiTModel
-
-A Transformer model for audio waveforms from [Stable Audio Open](https://huggingface.co/papers/2407.14358).
-
-## StableAudioDiTModel
-
-[[autodoc]] StableAudioDiTModel
@@ -25,9 +25,6 @@ The abstract of the paper is the following:
 | Pipeline | Tasks | Demo
 |---|---|:---:|
 | [AnimateDiffPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff.py) | *Text-to-Video Generation with AnimateDiff* |
-| [AnimateDiffControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_controlnet.py) | *Controlled Video-to-Video Generation with AnimateDiff using ControlNet* |
-| [AnimateDiffSparseControlNetPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_sparsectrl.py) | *Controlled Video-to-Video Generation with AnimateDiff using SparseCtrl* |
-| [AnimateDiffSDXLPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_sdxl.py) | *Video-to-Video Generation with AnimateDiff* |
 | [AnimateDiffVideoToVideoPipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/animatediff/pipeline_animatediff_video2video.py) | *Video-to-Video Generation with AnimateDiff* |

 ## Available checkpoints
@@ -103,266 +100,6 @@ AnimateDiff tends to work better with finetuned Stable Diffusion models. If you

 </Tip>

-### AnimateDiffControlNetPipeline
-
-AnimateDiff can also be used with ControlNets. ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models](https://huggingface.co/papers/2302.05543) by Lvmin Zhang, Anyi Rao, and Maneesh Agrawala. With a ControlNet model, you can provide an additional control image to condition and control Stable Diffusion generation. For example, if you provide depth maps, the ControlNet model generates a video that'll preserve the spatial information from the depth maps. It is a more flexible and accurate way to control the video generation process.
-
-```python
-import torch
-from diffusers import AnimateDiffControlNetPipeline, AutoencoderKL, ControlNetModel, MotionAdapter, LCMScheduler
-from diffusers.utils import export_to_gif, load_video
-
-# Additionally, you will need to preprocess videos before they can be used with the ControlNet
-# HF maintains just the right package for it: `pip install controlnet_aux`
-from controlnet_aux.processor import ZoeDetector
-
-# Download controlnets from https://huggingface.co/lllyasviel/ControlNet-v1-1 to use .from_single_file
-# Download Diffusers-format controlnets, such as https://huggingface.co/lllyasviel/sd-controlnet-depth, to use .from_pretrained()
-controlnet = ControlNetModel.from_single_file("control_v11f1p_sd15_depth.pth", torch_dtype=torch.float16)
-
-# We use AnimateLCM for this example but one can use the original motion adapters as well (for example, https://huggingface.co/guoyww/animatediff-motion-adapter-v1-5-3)
-motion_adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")
-
-vae = AutoencoderKL.from_pretrained("stabilityai/sd-vae-ft-mse", torch_dtype=torch.float16)
-pipe: AnimateDiffControlNetPipeline = AnimateDiffControlNetPipeline.from_pretrained(
-    "SG161222/Realistic_Vision_V5.1_noVAE",
-    motion_adapter=motion_adapter,
-    controlnet=controlnet,
-    vae=vae,
-).to(device="cuda", dtype=torch.float16)
-pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
-pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="AnimateLCM_sd15_t2v_lora.safetensors", adapter_name="lcm-lora")
-pipe.set_adapters(["lcm-lora"], [0.8])
-
-depth_detector = ZoeDetector.from_pretrained("lllyasviel/Annotators").to("cuda")
-video = load_video("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif")
-conditioning_frames = []
-
-with pipe.progress_bar(total=len(video)) as progress_bar:
-    for frame in video:
-        conditioning_frames.append(depth_detector(frame))
-        progress_bar.update()
-
-prompt = "a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality"
-negative_prompt = "bad quality, worst quality"
-
-video = pipe(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    num_frames=len(video),
-    num_inference_steps=10,
-    guidance_scale=2.0,
-    conditioning_frames=conditioning_frames,
-    generator=torch.Generator().manual_seed(42),
-).frames[0]
-
-export_to_gif(video, "animatediff_controlnet.gif", fps=8)
-```
-
-Here are some sample outputs:
-
-<table align="center">
-    <tr>
-      <th align="center">Source Video</th>
-      <th align="center">Output Video</th>
-    </tr>
-    <tr>
-        <td align="center">
-          raccoon playing a guitar
-          <br />
-          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-vid2vid-input-1.gif" alt="raccoon playing a guitar" />
-        </td>
-        <td align="center">
-          a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality
-          <br/>
-          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-controlnet-output.gif" alt="a panda, playing a guitar, sitting in a pink boat, in the ocean, mountains in background, realistic, high quality" />
-        </td>
-    </tr>
-</table>
-
-### AnimateDiffSparseControlNetPipeline
-
-[SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion Models](https://arxiv.org/abs/2311.16933), by Yuwei Guo, Ceyuan Yang, Anyi Rao, Maneesh Agrawala, Dahua Lin, and Bo Dai, was introduced for achieving controlled generation in text-to-video diffusion models.
-
-The abstract from the paper is:
-
-*The development of text-to-video (T2V), i.e., generating videos with a given text prompt, has been significantly advanced in recent years. However, relying solely on text prompts often results in ambiguous frame composition due to spatial uncertainty. The research community thus leverages the dense structure signals, e.g., per-frame depth/edge sequences, to enhance controllability, whose collection accordingly increases the burden of inference. In this work, we present SparseCtrl to enable flexible structure control with temporally sparse signals, requiring only one or a few inputs, as shown in Figure 1. It incorporates an additional condition encoder to process these sparse signals while leaving the pre-trained T2V model untouched. The proposed approach is compatible with various modalities, including sketches, depth maps, and RGB images, providing more practical control for video generation and promoting applications such as storyboarding, depth rendering, keyframe animation, and interpolation. Extensive experiments demonstrate the generalization of SparseCtrl on both original and personalized T2V generators. Codes and models will be publicly available at [this https URL](https://guoyww.github.io/projects/SparseCtrl).*
-
-SparseCtrl introduces the following checkpoints for controlled text-to-video generation:
-
- [SparseCtrl Scribble](https://huggingface.co/guoyww/animatediff-sparsectrl-scribble)
- [SparseCtrl RGB](https://huggingface.co/guoyww/animatediff-sparsectrl-rgb)
-
-#### Using SparseCtrl Scribble
-
-```python
-import torch
-
-from diffusers import AnimateDiffSparseControlNetPipeline
-from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel
-from diffusers.schedulers import DPMSolverMultistepScheduler
-from diffusers.utils import export_to_gif, load_image
-
-
-model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
-motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3"
-controlnet_id = "guoyww/animatediff-sparsectrl-scribble"
-lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3"
-vae_id = "stabilityai/sd-vae-ft-mse"
-device = "cuda"
-
-motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device)
-controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device)
-vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device)
-scheduler = DPMSolverMultistepScheduler.from_pretrained(
-    model_id,
-    subfolder="scheduler",
-    beta_schedule="linear",
-    algorithm_type="dpmsolver++",
-    use_karras_sigmas=True,
-)
-pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
-    model_id,
-    motion_adapter=motion_adapter,
-    controlnet=controlnet,
-    vae=vae,
-    scheduler=scheduler,
-    torch_dtype=torch.float16,
-).to(device)
-pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora")
-pipe.fuse_lora(lora_scale=1.0)
-
-prompt = "an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality"
-negative_prompt = "low quality, worst quality, letterboxed"
-
-image_files = [
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-1.png",
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-2.png",
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-3.png"
-]
-condition_frame_indices = [0, 8, 15]
-conditioning_frames = [load_image(img_file) for img_file in image_files]
-
-video = pipe(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    num_inference_steps=25,
-    conditioning_frames=conditioning_frames,
-    controlnet_conditioning_scale=1.0,
-    controlnet_frame_indices=condition_frame_indices,
-    generator=torch.Generator().manual_seed(1337),
-).frames[0]
-export_to_gif(video, "output.gif")
-```
-
-Here are some sample outputs:
-
-<table align="center">
-    <tr>
-        <center>
-          <b>an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality</b>
-        </center>
-    </tr>
-    <tr>
-        <td>
-          <center>
-            <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-1.png" alt="scribble-1" />
-          </center>
-        </td>
-        <td>
-          <center>
-            <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-2.png" alt="scribble-2" />
-          </center>
-        </td>
-        <td>
-          <center>
-            <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-scribble-3.png" alt="scribble-3" />
-          </center>
-        </td>
-    </tr>
-    <tr>
-        <td colspan=3>
-          <center>
-            <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-sparsectrl-scribble-results.gif" alt="an aerial view of a cyberpunk city, night time, neon lights, masterpiece, high quality" />
-          </center>
-        </td>
-    </tr>
-</table>
-
-#### Using SparseCtrl RGB
-
-```python
-import torch
-
-from diffusers import AnimateDiffSparseControlNetPipeline
-from diffusers.models import AutoencoderKL, MotionAdapter, SparseControlNetModel
-from diffusers.schedulers import DPMSolverMultistepScheduler
-from diffusers.utils import export_to_gif, load_image
-
-
-model_id = "SG161222/Realistic_Vision_V5.1_noVAE"
-motion_adapter_id = "guoyww/animatediff-motion-adapter-v1-5-3"
-controlnet_id = "guoyww/animatediff-sparsectrl-rgb"
-lora_adapter_id = "guoyww/animatediff-motion-lora-v1-5-3"
-vae_id = "stabilityai/sd-vae-ft-mse"
-device = "cuda"
-
-motion_adapter = MotionAdapter.from_pretrained(motion_adapter_id, torch_dtype=torch.float16).to(device)
-controlnet = SparseControlNetModel.from_pretrained(controlnet_id, torch_dtype=torch.float16).to(device)
-vae = AutoencoderKL.from_pretrained(vae_id, torch_dtype=torch.float16).to(device)
-scheduler = DPMSolverMultistepScheduler.from_pretrained(
-    model_id,
-    subfolder="scheduler",
-    beta_schedule="linear",
-    algorithm_type="dpmsolver++",
-    use_karras_sigmas=True,
-)
-pipe = AnimateDiffSparseControlNetPipeline.from_pretrained(
-    model_id,
-    motion_adapter=motion_adapter,
-    controlnet=controlnet,
-    vae=vae,
-    scheduler=scheduler,
-    torch_dtype=torch.float16,
-).to(device)
-pipe.load_lora_weights(lora_adapter_id, adapter_name="motion_lora")
-
-image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-firework.png")
-
-video = pipe(
-    prompt="closeup face photo of man in black clothes, night city street, bokeh, fireworks in background",
-    negative_prompt="low quality, worst quality",
-    num_inference_steps=25,
-    conditioning_frames=image,
-    controlnet_frame_indices=[0],
-    controlnet_conditioning_scale=1.0,
-    generator=torch.Generator().manual_seed(42),
-).frames[0]
-export_to_gif(video, "output.gif")
-```
-
-Here are some sample outputs:
-
-<table align="center">
-    <tr>
-        <center>
-          <b>closeup face photo of man in black clothes, night city street, bokeh, fireworks in background</b>
-        </center>
-    </tr>
-    <tr>
-        <td>
-          <center>
-            <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-firework.png" alt="closeup face photo of man in black clothes, night city street, bokeh, fireworks in background" />
-          </center>
-        </td>
-        <td>
-          <center>
-            <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-sparsectrl-rgb-result.gif" alt="closeup face photo of man in black clothes, night city street, bokeh, fireworks in background" />
-          </center>
-        </td>
-    </tr>
-</table>
-
 ### AnimateDiffSDXLPipeline

 AnimateDiff can also be used with SDXL models. This is currently an experimental feature as only a beta release of the motion adapter checkpoint is available.
@@ -834,6 +571,7 @@ ckpt_path = "https://huggingface.co/Lightricks/LongAnimateDiff/blob/main/lt_long

 adapter = MotionAdapter.from_single_file(ckpt_path, torch_dtype=torch.float16)
 pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter)
+
 ```

 ## AnimateDiffPipeline
@@ -842,18 +580,6 @@ pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapt
  - all
  - __call__

-## AnimateDiffControlNetPipeline
-
-[[autodoc]] AnimateDiffControlNetPipeline
-  - all
-  - __call__
-
-## AnimateDiffSparseControlNetPipeline
-
-[[autodoc]] AnimateDiffSparseControlNetPipeline
-  - all
-  - __call__
-
 ## AnimateDiffSDXLPipeline

 [[autodoc]] AnimateDiffSDXLPipeline
@@ -41,64 +41,6 @@ image = pipe(
 image.save("kolors_sample.png")
 ```

-### IP Adapter
-
-Kolors needs a different IP Adapter to work, and it uses [Openai-CLIP-336](https://huggingface.co/openai/clip-vit-large-patch14-336) as an image encoder.
-
-<Tip>
-
-Using an IP Adapter with Kolors requires more than 24GB of VRAM. To use it, we recommend using [`~DiffusionPipeline.enable_model_cpu_offload`] on consumer GPUs.
-
-</Tip>
-
-<Tip>
-
-While Kolors is integrated in Diffusers, you need to load the image encoder from a revision to use the safetensor files. You can still use the main branch of the original repository if you're comfortable loading pickle checkpoints.
-
-</Tip>
-
-```python
-import torch
-from transformers import CLIPVisionModelWithProjection
-
-from diffusers import DPMSolverMultistepScheduler, KolorsPipeline
-from diffusers.utils import load_image
-
-image_encoder = CLIPVisionModelWithProjection.from_pretrained(
-    "Kwai-Kolors/Kolors-IP-Adapter-Plus",
-    subfolder="image_encoder",
-    low_cpu_mem_usage=True,
-    torch_dtype=torch.float16,
-    revision="refs/pr/4",
-)
-
-pipe = KolorsPipeline.from_pretrained(
-    "Kwai-Kolors/Kolors-diffusers", image_encoder=image_encoder, torch_dtype=torch.float16, variant="fp16"
-).to("cuda")
-pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config, use_karras_sigmas=True)
-
-pipe.load_ip_adapter(
-    "Kwai-Kolors/Kolors-IP-Adapter-Plus",
-    subfolder="",
-    weight_name="ip_adapter_plus_general.safetensors",
-    revision="refs/pr/4",
-    image_encoder_folder=None,
-)
-pipe.enable_model_cpu_offload()
-
-ipa_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/kolors/cat_square.png")
-
-image = pipe(
-    prompt="best quality, high quality",
-    negative_prompt="",
-    guidance_scale=6.5,
-    num_inference_steps=25,
-    ip_adapter_image=ipa_image,
-).images[0]
-
-image.save("kolors_ipa_sample.png")
-```
-
 ## KolorsPipeline

 [[autodoc]] KolorsPipeline
@@ -24,8 +24,6 @@ The abstract from the paper is:

 **Highlights**: Latte is a latent diffusion transformer proposed as a backbone for modeling different modalities (trained for text-to-video generation here). It achieves state-of-the-art performance across four standard video benchmarks - [FaceForensics](https://arxiv.org/abs/1803.09179), [SkyTimelapse](https://arxiv.org/abs/1709.07592), [UCF101](https://arxiv.org/abs/1212.0402) and [Taichi-HD](https://arxiv.org/abs/2003.00196). To prepare and download the datasets for evaluation, please refer to the [dataset evaluation guide](https://github.com/Vchitect/Latte/blob/main/docs/datasets_evaluation.md).

-This pipeline was contributed by [maxin-cn](https://github.com/maxin-cn). The original codebase can be found [here](https://github.com/Vchitect/Latte). The original weights can be found under [hf.co/maxin-cn](https://huggingface.co/maxin-cn).
-
 <Tip>

 Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
@@ -43,8 +43,6 @@ Lumina-T2X has the following components:
 * It uses a Flow-based Large Diffusion Transformer as the backbone
 * It supports any modality with one backbone and the corresponding encoder and decoder.

-This pipeline was contributed by [PommesPeter](https://github.com/PommesPeter). The original codebase can be found [here](https://github.com/Alpha-VLLM/Lumina-T2X). The original weights can be found under [hf.co/Alpha-VLLM](https://huggingface.co/Alpha-VLLM).
-
 <Tip>

 Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
@@ -71,7 +71,6 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [Semantic Guidance](semantic_stable_diffusion) | text2image |
 | [Shap-E](shap_e) | text-to-3D, image-to-3D |
 | [Spectrogram Diffusion](spectrogram_diffusion) |  |
-| [Stable Audio](stable_audio) | text2audio |
 | [Stable Diffusion](stable_diffusion/overview) | text2image, image2image, depth2image, inpainting, image variation, latent upscaler, super-resolution |
 | [Stable Diffusion Model Editing](model_editing) | model editing |
 | [Stable Diffusion XL](stable_diffusion/stable_diffusion_xl) | text2image, image2image, inpainting |
@@ -1,42 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Stable Audio
-
-Stable Audio was proposed in [Stable Audio Open](https://arxiv.org/abs/2407.14358) by Zach Evans et al. . it takes a text prompt as input and predicts the corresponding sound or music sample.
-
-Stable Audio Open generates variable-length (up to 47s) stereo audio at 44.1kHz from text prompts. It comprises three components: an autoencoder that compresses waveforms into a manageable sequence length, a T5-based text embedding for text conditioning, and a transformer-based diffusion (DiT) model that operates in the latent space of the autoencoder.
-
-Stable Audio is trained on a corpus of around 48k audio recordings, where around 47k are from Freesound and the rest are from the Free Music Archive (FMA). All audio files are licensed under CC0, CC BY, or CC Sampling+. This data is used to train the autoencoder and the DiT. 
-
-The abstract of the paper is the following:
-*Open generative models are vitally important for the community, allowing for fine-tunes and serving as baselines when presenting new models. However, most current text-to-audio models are private and not accessible for artists and researchers to build upon. Here we describe the architecture and training process of a new open-weights text-to-audio model trained with Creative Commons data. Our evaluation shows that the model's performance is competitive with the state-of-the-art across various metrics. Notably, the reported FDopenl3 results (measuring the realism of the generations) showcase its potential for high-quality stereo sound synthesis at 44.1kHz.*
-
-This pipeline was contributed by [Yoach Lacombe](https://huggingface.co/ylacombe). The original codebase can be found at [Stability-AI/stable-audio-tool](https://github.com/Stability-AI/stable-audio-tool).
-
-## Tips
-
-When constructing a prompt, keep in mind:
-
-* Descriptive prompt inputs work best; use adjectives to describe the sound (for example, "high quality" or "clear") and make the prompt context specific where possible (e.g. "melodic techno with a fast beat and synths" works better than "techno").
-* Using a *negative prompt* can significantly improve the quality of the generated audio. Try using a negative prompt of "low quality, average quality".
-
-During inference:
-
-* The _quality_ of the generated audio sample can be controlled by the `num_inference_steps` argument; higher steps give higher quality audio at the expense of slower inference.
-* Multiple waveforms can be generated in one go: set `num_waveforms_per_prompt` to a value greater than 1 to enable. Automatic scoring will be performed between the generated waveforms and prompt text, and the audios ranked from best to worst accordingly.
-
-
-## StableAudioPipeline
-[[autodoc]] StableAudioPipeline
-	- all
-	- __call__
@@ -1,24 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# CosineDPMSolverMultistepScheduler
-
-The [`CosineDPMSolverMultistepScheduler`] is a variant of [`DPMSolverMultistepScheduler`] with cosine schedule, proposed by Nichol and Dhariwal (2021).
-It is being used in the [Stable Audio Open](https://arxiv.org/abs/2407.14358) paper and the [Stability-AI/stable-audio-tool](https://github.com/Stability-AI/stable-audio-tool) codebase.
-
-This scheduler was contributed by [Yoach Lacombe](https://huggingface.co/ylacombe).
-
-## CosineDPMSolverMultistepScheduler
-[[autodoc]] CosineDPMSolverMultistepScheduler
-
-## SchedulerOutput
-[[autodoc]] schedulers.scheduling_utils.SchedulerOutput
@@ -340,8 +340,8 @@ Now you can wrap all these components together in a training loop with 🤗 Acce
 ...                 loss = F.mse_loss(noise_pred, noise)
 ...                 accelerator.backward(loss)

-...                 if accelerator.sync_gradients:
-...                     accelerator.clip_grad_norm_(model.parameters(), 1.0)
+...             if (step + 1) % config.gradient_accumulation_steps == 0:
+...                 accelerator.clip_grad_norm_(model.parameters(), 1.0)
 ...                 optimizer.step()
 ...                 lr_scheduler.step()
 ...                 optimizer.zero_grad()
@@ -1641,18 +1641,18 @@ from io import BytesIO
 from PIL import Image
 import torch
 from diffusers import DDIMScheduler
-from diffusers import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionImg2ImgPipeline

 # Use the DDIMScheduler scheduler here instead
 scheduler = DDIMScheduler.from_pretrained("stabilityai/stable-diffusion-2-1",
                                            subfolder="scheduler")


-pipe = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-2-1",
-                                            custom_pipeline="stable_diffusion_tensorrt_img2img",
-                                            variant='fp16',
-                                            torch_dtype=torch.float16,
-                                            scheduler=scheduler,)
+pipe = StableDiffusionImg2ImgPipeline.from_pretrained("stabilityai/stable-diffusion-2-1",
+                                                custom_pipeline="stable_diffusion_tensorrt_img2img",
+                                                variant='fp16',
+                                                torch_dtype=torch.float16,
+                                                scheduler=scheduler,)

 # re-use cached folder to save ONNX models and TensorRT Engines
 pipe.set_cached_folder("stabilityai/stable-diffusion-2-1", variant='fp16',)
@@ -13,17 +13,13 @@ from diffusers.configuration_utils import FrozenDict
 from diffusers.image_processor import VaeImageProcessor
 from diffusers.loaders import FromSingleFileMixin, StableDiffusionLoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import (
    PIL_INTERPOLATION,
-    USE_PEFT_BACKEND,
    deprecate,
    logging,
-    scale_lora_layers,
-    unscale_lora_layers,
 )
 from diffusers.utils.torch_utils import randn_tensor

@@ -203,7 +199,6 @@ def get_unweighted_text_embeddings(
    text_input: torch.Tensor,
    chunk_length: int,
    no_boseos_middle: Optional[bool] = True,
-    clip_skip: Optional[int] = None,
 ):
    """
    When the length of tokens is a multiple of the capacity of the text encoder,
@@ -219,20 +214,7 @@ def get_unweighted_text_embeddings(
            # cover the head and the tail by the starting and the ending tokens
            text_input_chunk[:, 0] = text_input[0, 0]
            text_input_chunk[:, -1] = text_input[0, -1]
-            if clip_skip is None:
-                prompt_embeds = pipe.text_encoder(text_input_chunk.to(pipe.device))
-                text_embedding = prompt_embeds[0]
-            else:
-                prompt_embeds = pipe.text_encoder(text_input_chunk.to(pipe.device), output_hidden_states=True)
-                # Access the `hidden_states` first, that contains a tuple of
-                # all the hidden states from the encoder layers. Then index into
-                # the tuple to access the hidden states from the desired layer.
-                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
-                # We also need to apply the final LayerNorm here to not mess with the
-                # representations. The `last_hidden_states` that we typically use for
-                # obtaining the final prompt representations passes through the LayerNorm
-                # layer.
-                text_embedding = pipe.text_encoder.text_model.final_layer_norm(prompt_embeds)
+            text_embedding = pipe.text_encoder(text_input_chunk)[0]

            if no_boseos_middle:
                if i == 0:
@@ -248,10 +230,7 @@ def get_unweighted_text_embeddings(
            text_embeddings.append(text_embedding)
        text_embeddings = torch.concat(text_embeddings, axis=1)
    else:
-        if clip_skip is None:
-            clip_skip = 0
-        prompt_embeds = pipe.text_encoder(text_input, output_hidden_states=True)[-1][-(clip_skip + 1)]
-        text_embeddings = pipe.text_encoder.text_model.final_layer_norm(prompt_embeds)
+        text_embeddings = pipe.text_encoder(text_input)[0]
    return text_embeddings


@@ -263,8 +242,6 @@ def get_weighted_text_embeddings(
    no_boseos_middle: Optional[bool] = False,
    skip_parsing: Optional[bool] = False,
    skip_weighting: Optional[bool] = False,
-    clip_skip=None,
-    lora_scale=None,
 ):
    r"""
    Prompts can be assigned with local weights using brackets. For example,
@@ -291,16 +268,6 @@ def get_weighted_text_embeddings(
        skip_weighting (`bool`, *optional*, defaults to `False`):
            Skip the weighting. When the parsing is skipped, it is forced True.
    """
-    # set lora scale so that monkey patched LoRA
-    # function of text encoder can correctly access it
-    if lora_scale is not None and isinstance(pipe, StableDiffusionLoraLoaderMixin):
-        pipe._lora_scale = lora_scale
-
-        # dynamically adjust the LoRA scale
-        if not USE_PEFT_BACKEND:
-            adjust_lora_scale_text_encoder(pipe.text_encoder, lora_scale)
-        else:
-            scale_lora_layers(pipe.text_encoder, lora_scale)
    max_length = (pipe.tokenizer.model_max_length - 2) * max_embeddings_multiples + 2
    if isinstance(prompt, str):
        prompt = [prompt]
@@ -367,7 +334,10 @@ def get_weighted_text_embeddings(

    # get the embeddings
    text_embeddings = get_unweighted_text_embeddings(
-        pipe, prompt_tokens, pipe.tokenizer.model_max_length, no_boseos_middle=no_boseos_middle, clip_skip=clip_skip
+        pipe,
+        prompt_tokens,
+        pipe.tokenizer.model_max_length,
+        no_boseos_middle=no_boseos_middle,
    )
    prompt_weights = torch.tensor(prompt_weights, dtype=text_embeddings.dtype, device=text_embeddings.device)
    if uncond_prompt is not None:
@@ -376,7 +346,6 @@ def get_weighted_text_embeddings(
            uncond_tokens,
            pipe.tokenizer.model_max_length,
            no_boseos_middle=no_boseos_middle,
-            clip_skip=clip_skip,
        )
        uncond_weights = torch.tensor(uncond_weights, dtype=uncond_embeddings.dtype, device=uncond_embeddings.device)

@@ -393,11 +362,6 @@ def get_weighted_text_embeddings(
            current_mean = uncond_embeddings.float().mean(axis=[-2, -1]).to(uncond_embeddings.dtype)
            uncond_embeddings *= (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1)

-    if pipe.text_encoder is not None:
-        if isinstance(pipe, StableDiffusionLoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(pipe.text_encoder, lora_scale)
-
    if uncond_prompt is not None:
        return text_embeddings, uncond_embeddings
    return text_embeddings, None
@@ -585,8 +549,6 @@ class StableDiffusionLongPromptWeightingPipeline(
        max_embeddings_multiples=3,
        prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        clip_skip: Optional[int] = None,
-        lora_scale: Optional[float] = None,
    ):
        r"""
        Encodes the prompt into text encoder hidden states.
@@ -635,8 +597,6 @@ class StableDiffusionLongPromptWeightingPipeline(
                prompt=prompt,
                uncond_prompt=negative_prompt if do_classifier_free_guidance else None,
                max_embeddings_multiples=max_embeddings_multiples,
-                clip_skip=clip_skip,
-                lora_scale=lora_scale,
            )
            if prompt_embeds is None:
                prompt_embeds = prompt_embeds1
@@ -830,7 +790,6 @@ class StableDiffusionLongPromptWeightingPipeline(
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        is_cancelled_callback: Optional[Callable[[], bool]] = None,
-        clip_skip: Optional[int] = None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
@@ -906,9 +865,6 @@ class StableDiffusionLongPromptWeightingPipeline(
            is_cancelled_callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. If the function returns
                `True`, the inference will be cancelled.
-            clip_skip (`int`, *optional*):
-                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
-                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
@@ -947,7 +903,6 @@ class StableDiffusionLongPromptWeightingPipeline(
        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
        # corresponds to doing no classifier free guidance.
        do_classifier_free_guidance = guidance_scale > 1.0
-        lora_scale = cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None

        # 3. Encode input prompt
        prompt_embeds = self._encode_prompt(
@@ -959,8 +914,6 @@ class StableDiffusionLongPromptWeightingPipeline(
            max_embeddings_multiples,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
-            clip_skip=clip_skip,
-            lora_scale=lora_scale,
        )
        dtype = prompt_embeds.dtype

@@ -1091,7 +1044,6 @@ class StableDiffusionLongPromptWeightingPipeline(
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
        is_cancelled_callback: Optional[Callable[[], bool]] = None,
-        clip_skip=None,
        callback_steps: int = 1,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
    ):
@@ -1149,9 +1101,6 @@ class StableDiffusionLongPromptWeightingPipeline(
            is_cancelled_callback (`Callable`, *optional*):
                A function that will be called every `callback_steps` steps during inference. If the function returns
                `True`, the inference will be cancelled.
-            clip_skip (`int`, *optional*):
-                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
-                the output of the pre-final layer will be used for computing the prompt embeddings.
            callback_steps (`int`, *optional*, defaults to 1):
                The frequency at which the `callback` function will be called. If not specified, the callback will be
                called at every step.
@@ -1186,7 +1135,6 @@ class StableDiffusionLongPromptWeightingPipeline(
            return_dict=return_dict,
            callback=callback,
            is_cancelled_callback=is_cancelled_callback,
-            clip_skip=clip_skip,
            callback_steps=callback_steps,
            cross_attention_kwargs=cross_attention_kwargs,
        )
@@ -25,25 +25,21 @@ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
 from diffusers.loaders import (
    FromSingleFileMixin,
    IPAdapterMixin,
-    StableDiffusionXLLoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
    TextualInversionLoaderMixin,
 )
 from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel
 from diffusers.models.attention_processor import AttnProcessor2_0, XFormersAttnProcessor
-from diffusers.models.lora import adjust_lora_scale_text_encoder
 from diffusers.pipelines.pipeline_utils import StableDiffusionMixin
 from diffusers.pipelines.stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import (
-    USE_PEFT_BACKEND,
    deprecate,
    is_accelerate_available,
    is_accelerate_version,
    is_invisible_watermark_available,
    logging,
    replace_example_docstring,
-    scale_lora_layers,
-    unscale_lora_layers,
 )
 from diffusers.utils.torch_utils import randn_tensor

@@ -265,7 +261,6 @@ def get_weighted_text_embeddings_sdxl(
    num_images_per_prompt: int = 1,
    device: Optional[torch.device] = None,
    clip_skip: Optional[int] = None,
-    lora_scale: Optional[int] = None,
 ):
    """
    This function can process long prompt with weights, no length limitation
@@ -286,24 +281,6 @@ def get_weighted_text_embeddings_sdxl(
    """
    device = device or pipe._execution_device

-    # set lora scale so that monkey patched LoRA
-    # function of text encoder can correctly access it
-    if lora_scale is not None and isinstance(pipe, StableDiffusionXLLoraLoaderMixin):
-        pipe._lora_scale = lora_scale
-
-        # dynamically adjust the LoRA scale
-        if pipe.text_encoder is not None:
-            if not USE_PEFT_BACKEND:
-                adjust_lora_scale_text_encoder(pipe.text_encoder, lora_scale)
-            else:
-                scale_lora_layers(pipe.text_encoder, lora_scale)
-
-        if pipe.text_encoder_2 is not None:
-            if not USE_PEFT_BACKEND:
-                adjust_lora_scale_text_encoder(pipe.text_encoder_2, lora_scale)
-            else:
-                scale_lora_layers(pipe.text_encoder_2, lora_scale)
-
    if prompt_2:
        prompt = f"{prompt} {prompt_2}"

@@ -452,16 +429,6 @@ def get_weighted_text_embeddings_sdxl(
        bs_embed * num_images_per_prompt, -1
    )

-    if pipe.text_encoder is not None:
-        if isinstance(pipe, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(pipe.text_encoder, lora_scale)
-
-    if pipe.text_encoder_2 is not None:
-        if isinstance(pipe, StableDiffusionXLLoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(pipe.text_encoder_2, lora_scale)
-
    return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds


@@ -582,7 +549,7 @@ class SDXLLongPromptWeightingPipeline(
    StableDiffusionMixin,
    FromSingleFileMixin,
    IPAdapterMixin,
-    StableDiffusionXLLoraLoaderMixin,
+    StableDiffusionLoraLoaderMixin,
    TextualInversionLoaderMixin,
 ):
    r"""
@@ -594,8 +561,8 @@ class SDXLLongPromptWeightingPipeline(
    The pipeline also inherits the following loading methods:
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters
-        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
+        - [`~loaders.StableDiffusionLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings

    Args:
@@ -776,7 +743,7 @@ class SDXLLongPromptWeightingPipeline(

        # set lora scale so that monkey patched LoRA
        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, StableDiffusionXLLoraLoaderMixin):
+        if lora_scale is not None and isinstance(self, StableDiffusionLoraLoaderMixin):
            self._lora_scale = lora_scale

        if prompt is not None and isinstance(prompt, str):
@@ -1645,9 +1612,7 @@ class SDXLLongPromptWeightingPipeline(
                image_embeds = torch.cat([negative_image_embeds, image_embeds])

        # 3. Encode input prompt
-        lora_scale = (
-            self._cross_attention_kwargs.get("scale", None) if self._cross_attention_kwargs is not None else None
-        )
+        (self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None)

        negative_prompt = negative_prompt if negative_prompt is not None else ""

@@ -1662,7 +1627,6 @@ class SDXLLongPromptWeightingPipeline(
            neg_prompt=negative_prompt,
            num_images_per_prompt=num_images_per_prompt,
            clip_skip=clip_skip,
-            lora_scale=lora_scale,
        )
        dtype = prompt_embeds.dtype

@@ -18,7 +18,8 @@
 import gc
 import os
 from collections import OrderedDict
-from typing import List, Optional, Tuple, Union
+from copy import copy
+from typing import List, Optional, Union

 import numpy as np
 import onnx
@@ -26,11 +27,9 @@ import onnx_graphsurgeon as gs
 import PIL.Image
 import tensorrt as trt
 import torch
-from cuda import cudart
 from huggingface_hub import snapshot_download
 from huggingface_hub.utils import validate_hf_hub_args
 from onnx import shape_inference
-from packaging import version
 from polygraphy import cuda
 from polygraphy.backend.common import bytes_from_path
 from polygraphy.backend.onnx.loader import fold_constants
@@ -42,13 +41,12 @@ from polygraphy.backend.trt import (
    network_from_onnx_path,
    save_engine,
 )
+from polygraphy.backend.trt import util as trt_util
 from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

-from diffusers import DiffusionPipeline
-from diffusers.configuration_utils import FrozenDict, deprecate
-from diffusers.image_processor import VaeImageProcessor
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import (
+    StableDiffusionImg2ImgPipeline,
    StableDiffusionPipelineOutput,
    StableDiffusionSafetyChecker,
 )
@@ -60,7 +58,7 @@ from diffusers.utils import logging
 """
 Installation instructions
 python3 -m pip install --upgrade transformers "diffusers>=0.16.0"
-python3 -m pip install --upgrade tensorrt-cu12==10.2.0
+python3 -m pip install --upgrade "tensorrt>=8.6.1"
 python3 -m pip install --upgrade "polygraphy>=0.47.0" onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
 python3 -m pip install onnxruntime
 """
@@ -90,6 +88,10 @@ else:
 torch_to_numpy_dtype_dict = {value: key for (key, value) in numpy_to_torch_dtype_dict.items()}


+def device_view(t):
+    return cuda.DeviceView(ptr=t.data_ptr(), shape=t.shape, dtype=torch_to_numpy_dtype_dict[t.dtype])
+
+
 def preprocess_image(image):
    """
    image: torch.Tensor
@@ -123,8 +125,10 @@ class Engine:
        onnx_path,
        fp16,
        input_profile=None,
+        enable_preview=False,
        enable_all_tactics=False,
        timing_cache=None,
+        workspace_size=0,
    ):
        logger.warning(f"Building TensorRT engine for {onnx_path}: {self.engine_path}")
        p = Profile()
@@ -133,13 +137,20 @@ class Engine:
                assert len(dims) == 3
                p.add(name, min=dims[0], opt=dims[1], max=dims[2])

-        extra_build_args = {}
+        config_kwargs = {}
+
+        config_kwargs["preview_features"] = [trt.PreviewFeature.DISABLE_EXTERNAL_TACTIC_SOURCES_FOR_CORE_0805]
+        if enable_preview:
+            # Faster dynamic shapes made optional since it increases engine build time.
+            config_kwargs["preview_features"].append(trt.PreviewFeature.FASTER_DYNAMIC_SHAPES_0805)
+        if workspace_size > 0:
+            config_kwargs["memory_pool_limits"] = {trt.MemoryPoolType.WORKSPACE: workspace_size}
        if not enable_all_tactics:
-            extra_build_args["tactic_sources"] = []
+            config_kwargs["tactic_sources"] = []

        engine = engine_from_network(
            network_from_onnx_path(onnx_path, flags=[trt.OnnxParserFlag.NATIVE_INSTANCENORM]),
-            config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **extra_build_args),
+            config=CreateConfig(fp16=fp16, profiles=[p], load_timing_cache=timing_cache, **config_kwargs),
            save_timing_cache=timing_cache,
        )
        save_engine(engine, path=self.engine_path)
@@ -152,24 +163,28 @@ class Engine:
        self.context = self.engine.create_execution_context()

    def allocate_buffers(self, shape_dict=None, device="cuda"):
-        for binding in range(self.engine.num_io_tensors):
-            name = self.engine.get_tensor_name(binding)
-            if shape_dict and name in shape_dict:
-                shape = shape_dict[name]
+        for idx in range(trt_util.get_bindings_per_profile(self.engine)):
+            binding = self.engine[idx]
+            if shape_dict and binding in shape_dict:
+                shape = shape_dict[binding]
            else:
-                shape = self.engine.get_tensor_shape(name)
-            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
-            if self.engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT:
-                self.context.set_input_shape(name, shape)
+                shape = self.engine.get_binding_shape(binding)
+            dtype = trt.nptype(self.engine.get_binding_dtype(binding))
+            if self.engine.binding_is_input(binding):
+                self.context.set_binding_shape(idx, shape)
            tensor = torch.empty(tuple(shape), dtype=numpy_to_torch_dtype_dict[dtype]).to(device=device)
-            self.tensors[name] = tensor
+            self.tensors[binding] = tensor
+            self.buffers[binding] = cuda.DeviceView(ptr=tensor.data_ptr(), shape=shape, dtype=dtype)

    def infer(self, feed_dict, stream):
+        start_binding, end_binding = trt_util.get_active_profile_bindings(self.context)
+        # shallow copy of ordered dict
+        device_buffers = copy(self.buffers)
        for name, buf in feed_dict.items():
-            self.tensors[name].copy_(buf)
-        for name, tensor in self.tensors.items():
-            self.context.set_tensor_address(name, tensor.data_ptr())
-        noerror = self.context.execute_async_v3(stream)
+            assert isinstance(buf, cuda.DeviceView)
+            device_buffers[name] = buf
+        bindings = [0] * start_binding + [buf.ptr for buf in device_buffers.values()]
+        noerror = self.context.execute_async_v2(bindings=bindings, stream_handle=stream.ptr)
        if not noerror:
            raise ValueError("ERROR: inference failed.")

@@ -310,8 +325,10 @@ def build_engines(
    force_engine_rebuild=False,
    static_batch=False,
    static_shape=True,
+    enable_preview=False,
    enable_all_tactics=False,
    timing_cache=None,
+    max_workspace_size=0,
 ):
    built_engines = {}
    if not os.path.isdir(onnx_dir):
@@ -376,7 +393,9 @@ def build_engines(
                    static_batch=static_batch,
                    static_shape=static_shape,
                ),
+                enable_preview=enable_preview,
                timing_cache=timing_cache,
+                workspace_size=max_workspace_size,
            )
        built_engines[model_name] = engine

@@ -655,7 +674,7 @@ def make_VAEEncoder(model, device, max_batch_size, embedding_dim, inpaint=False)
    return VAEEncoder(model, device=device, max_batch_size=max_batch_size, embedding_dim=embedding_dim)


-class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
+class TensorRTStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline):
    r"""
    Pipeline for image-to-image generation using TensorRT accelerated Stable Diffusion.

@@ -683,8 +702,6 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
    """

-    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
-
    def __init__(
        self,
        vae: AutoencoderKL,
@@ -705,86 +722,24 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
        onnx_dir: str = "onnx",
        # TensorRT engine build parameters
        engine_dir: str = "engine",
+        build_preview_features: bool = True,
        force_engine_rebuild: bool = False,
        timing_cache: str = "timing_cache",
    ):
-        super().__init__()
-
-        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
-            deprecation_message = (
-                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
-                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
-                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
-                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
-                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
-                " file"
-            )
-            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(scheduler.config)
-            new_config["steps_offset"] = 1
-            scheduler._internal_dict = FrozenDict(new_config)
-
-        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
-            deprecation_message = (
-                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
-                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
-                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
-                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
-                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
-            )
-            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(scheduler.config)
-            new_config["clip_sample"] = False
-            scheduler._internal_dict = FrozenDict(new_config)
-
-        if safety_checker is None and requires_safety_checker:
-            logger.warning(
-                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
-                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
-                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
-                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
-                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
-                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
-            )
-
-        if safety_checker is not None and feature_extractor is None:
-            raise ValueError(
-                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
-                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
-            )
-
-        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
-            version.parse(unet.config._diffusers_version).base_version
-        ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
-        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
-            deprecation_message = (
-                "The configuration file of the unet has set the default `sample_size` to smaller than"
-                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
-                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
-                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
-                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
-                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
-                " in the config might lead to incorrect results in future versions. If you have downloaded this"
-                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
-                " the `unet/config.json` file"
-            )
-            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(unet.config)
-            new_config["sample_size"] = 64
-            unet._internal_dict = FrozenDict(new_config)
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
+        super().__init__(
+            vae,
+            text_encoder,
+            tokenizer,
+            unet,
+            scheduler,
            safety_checker=safety_checker,
            feature_extractor=feature_extractor,
            image_encoder=image_encoder,
+            requires_safety_checker=requires_safety_checker,
        )

+        self.vae.forward = self.vae.decode
+
        self.stages = stages
        self.image_height, self.image_width = image_height, image_width
        self.inpaint = False
@@ -795,6 +750,7 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
        self.timing_cache = timing_cache
        self.build_static_batch = False
        self.build_dynamic_shape = False
+        self.build_preview_features = build_preview_features

        self.max_batch_size = max_batch_size
        # TODO: Restrict batch size to 4 for larger image dimensions as a WAR for TensorRT limitation.
@@ -805,11 +761,6 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
        self.models = {}  # loaded in __loadModels()
        self.engine = {}  # loaded in build_engines()

-        self.vae.forward = self.vae.decode
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
-        self.register_to_config(requires_safety_checker=requires_safety_checker)
-
    def __loadModels(self):
        # Load pipeline models
        self.embedding_dim = self.text_encoder.config.hidden_size
@@ -828,33 +779,6 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
        if "vae_encoder" in self.stages:
            self.models["vae_encoder"] = make_VAEEncoder(self.vae, **models_args)

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
-    def run_safety_checker(
-        self, image: Union[torch.Tensor, PIL.Image.Image], device: torch.device, dtype: torch.dtype
-    ) -> Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]:
-        r"""
-        Runs the safety checker on the given image.
-        Args:
-            image (Union[torch.Tensor, PIL.Image.Image]): The input image to be checked.
-            device (torch.device): The device to run the safety checker on.
-            dtype (torch.dtype): The data type of the input image.
-        Returns:
-            (image, has_nsfw_concept) Tuple[Union[torch.Tensor, PIL.Image.Image], Optional[bool]]: A tuple containing the processed image and
-            a boolean indicating whether the image has a NSFW (Not Safe for Work) concept.
-        """
-        if self.safety_checker is None:
-            has_nsfw_concept = None
-        else:
-            if torch.is_tensor(image):
-                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
-            else:
-                feature_extractor_input = self.image_processor.numpy_to_pil(image)
-            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
-            )
-        return image, has_nsfw_concept
-
    @classmethod
    @validate_hf_hub_args
    def set_cached_folder(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
@@ -902,6 +826,7 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
            force_engine_rebuild=self.force_engine_rebuild,
            static_batch=self.build_static_batch,
            static_shape=not self.build_dynamic_shape,
+            enable_preview=self.build_preview_features,
            timing_cache=self.timing_cache,
        )

@@ -925,7 +850,9 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
        return tuple(init_images)

    def __encode_image(self, init_image):
-        init_latents = runEngine(self.engine["vae_encoder"], {"images": init_image}, self.stream)["latent"]
+        init_latents = runEngine(self.engine["vae_encoder"], {"images": device_view(init_image)}, self.stream)[
+            "latent"
+        ]
        init_latents = 0.18215 * init_latents
        return init_latents

@@ -954,8 +881,9 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
            .to(self.torch_device)
        )

+        text_input_ids_inp = device_view(text_input_ids)
        # NOTE: output tensor for CLIP must be cloned because it will be overwritten when called again for negative prompt
-        text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids}, self.stream)[
+        text_embeddings = runEngine(self.engine["clip"], {"input_ids": text_input_ids_inp}, self.stream)[
            "text_embeddings"
        ].clone()

@@ -971,7 +899,8 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
            .input_ids.type(torch.int32)
            .to(self.torch_device)
        )
-        uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids}, self.stream)[
+        uncond_input_ids_inp = device_view(uncond_input_ids)
+        uncond_embeddings = runEngine(self.engine["clip"], {"input_ids": uncond_input_ids_inp}, self.stream)[
            "text_embeddings"
        ]

@@ -995,15 +924,18 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
            # Predict the noise residual
            timestep_float = timestep.float() if timestep.dtype != torch.float32 else timestep

+            sample_inp = device_view(latent_model_input)
+            timestep_inp = device_view(timestep_float)
+            embeddings_inp = device_view(text_embeddings)
            noise_pred = runEngine(
                self.engine["unet"],
-                {"sample": latent_model_input, "timestep": timestep_float, "encoder_hidden_states": text_embeddings},
+                {"sample": sample_inp, "timestep": timestep_inp, "encoder_hidden_states": embeddings_inp},
                self.stream,
            )["latent"]

            # Perform guidance
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-            noise_pred = noise_pred_uncond + self._guidance_scale * (noise_pred_text - noise_pred_uncond)
+            noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)

            latents = self.scheduler.step(noise_pred, timestep, latents).prev_sample

@@ -1011,12 +943,12 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
        return latents

    def __decode_latent(self, latents):
-        images = runEngine(self.engine["vae"], {"latent": latents}, self.stream)["images"]
+        images = runEngine(self.engine["vae"], {"latent": device_view(latents)}, self.stream)["images"]
        images = (images / 2 + 0.5).clamp(0, 1)
        return images.cpu().permute(0, 2, 3, 1).float().numpy()

    def __loadResources(self, image_height, image_width, batch_size):
-        self.stream = cudart.cudaStreamCreate()[1]
+        self.stream = cuda.Stream()

        # Allocate buffers for TensorRT engine bindings
        for model_name, obj in self.models.items():
@@ -1129,6 +1061,5 @@ class TensorRTStableDiffusionImg2ImgPipeline(DiffusionPipeline):
            # VAE decode latent
            images = self.__decode_latent(latents)

-        images, has_nsfw_concept = self.run_safety_checker(images, self.torch_device, text_embeddings.dtype)
        images = self.numpy_to_pil(images)
-        return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
+        return StableDiffusionPipelineOutput(images=images, nsfw_content_detected=None)
@@ -1,8 +1,6 @@
 import argparse
-import os

 import torch
-from huggingface_hub import create_repo, upload_folder
 from safetensors.torch import load_file, save_file


@@ -27,14 +25,8 @@ def convert_motion_module(original_state_dict):

 def get_args():
    parser = argparse.ArgumentParser()
-    parser.add_argument("--ckpt_path", type=str, required=True, help="Path to checkpoint")
-    parser.add_argument("--output_path", type=str, required=True, help="Path to output directory")
-    parser.add_argument(
-        "--push_to_hub",
-        action="store_true",
-        default=False,
-        help="Whether to push the converted model to the HF or not",
-    )
+    parser.add_argument("--ckpt_path", type=str, required=True)
+    parser.add_argument("--output_path", type=str, required=True)

    return parser.parse_args()

@@ -59,11 +51,4 @@ if __name__ == "__main__":
            continue
        output_dict.update({f"unet.{module_name}": params})

-    os.makedirs(args.output_path, exist_ok=True)
-
-    filepath = os.path.join(args.output_path, "diffusion_pytorch_model.safetensors")
-    save_file(output_dict, filepath)
-
-    if args.push_to_hub:
-        repo_id = create_repo(args.output_path, exist_ok=True).repo_id
-        upload_folder(repo_id=repo_id, folder_path=args.output_path, repo_type="model")
+    save_file(output_dict, f"{args.output_path}/diffusion_pytorch_model.safetensors")
@@ -1,83 +0,0 @@
-import argparse
-from typing import Dict
-
-import torch
-import torch.nn as nn
-
-from diffusers import SparseControlNetModel
-
-
-KEYS_RENAME_MAPPING = {
-    ".attention_blocks.0": ".attn1",
-    ".attention_blocks.1": ".attn2",
-    ".attn1.pos_encoder": ".pos_embed",
-    ".ff_norm": ".norm3",
-    ".norms.0": ".norm1",
-    ".norms.1": ".norm2",
-    ".temporal_transformer": "",
-}
-
-
-def convert(original_state_dict: Dict[str, nn.Module]) -> Dict[str, nn.Module]:
-    converted_state_dict = {}
-
-    for key in list(original_state_dict.keys()):
-        renamed_key = key
-        for new_name, old_name in KEYS_RENAME_MAPPING.items():
-            renamed_key = renamed_key.replace(new_name, old_name)
-        converted_state_dict[renamed_key] = original_state_dict.pop(key)
-
-    return converted_state_dict
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--ckpt_path", type=str, required=True, help="Path to checkpoint")
-    parser.add_argument("--output_path", type=str, required=True, help="Path to output directory")
-    parser.add_argument(
-        "--max_motion_seq_length",
-        type=int,
-        default=32,
-        help="Max motion sequence length supported by the motion adapter",
-    )
-    parser.add_argument(
-        "--conditioning_channels", type=int, default=4, help="Number of channels in conditioning input to controlnet"
-    )
-    parser.add_argument(
-        "--use_simplified_condition_embedding",
-        action="store_true",
-        default=False,
-        help="Whether or not to use simplified condition embedding. When `conditioning_channels==4` i.e. latent inputs, set this to `True`. When `conditioning_channels==3` i.e. image inputs, set this to `False`",
-    )
-    parser.add_argument(
-        "--save_fp16",
-        action="store_true",
-        default=False,
-        help="Whether or not to save model in fp16 precision along with fp32",
-    )
-    parser.add_argument(
-        "--push_to_hub", action="store_true", default=False, help="Whether or not to push saved model to the HF hub"
-    )
-    return parser.parse_args()
-
-
-if __name__ == "__main__":
-    args = get_args()
-
-    state_dict = torch.load(args.ckpt_path, map_location="cpu")
-    if "state_dict" in state_dict.keys():
-        state_dict: dict = state_dict["state_dict"]
-
-    controlnet = SparseControlNetModel(
-        conditioning_channels=args.conditioning_channels,
-        motion_max_seq_length=args.max_motion_seq_length,
-        use_simplified_condition_embedding=args.use_simplified_condition_embedding,
-    )
-
-    state_dict = convert(state_dict)
-    controlnet.load_state_dict(state_dict, strict=True)
-
-    controlnet.save_pretrained(args.output_path, push_to_hub=args.push_to_hub)
-    if args.save_fp16:
-        controlnet = controlnet.to(dtype=torch.float16)
-        controlnet.save_pretrained(args.output_path, variant="fp16", push_to_hub=args.push_to_hub)
@@ -1,279 +0,0 @@
-# Run this script to convert the Stable Cascade model weights to a diffusers pipeline.
-import argparse
-import json
-import os
-from contextlib import nullcontext
-
-import torch
-from safetensors.torch import load_file
-from transformers import (
-    AutoTokenizer,
-    T5EncoderModel,
-)
-
-from diffusers import (
-    AutoencoderOobleck,
-    CosineDPMSolverMultistepScheduler,
-    StableAudioDiTModel,
-    StableAudioPipeline,
-    StableAudioProjectionModel,
-)
-from diffusers.models.modeling_utils import load_model_dict_into_meta
-from diffusers.utils import is_accelerate_available
-
-
-if is_accelerate_available():
-    from accelerate import init_empty_weights
-
-
-def convert_stable_audio_state_dict_to_diffusers(state_dict, num_autoencoder_layers=5):
-    projection_model_state_dict = {
-        k.replace("conditioner.conditioners.", "").replace("embedder.embedding", "time_positional_embedding"): v
-        for (k, v) in state_dict.items()
-        if "conditioner.conditioners" in k
-    }
-
-    # NOTE: we assume here that there's no projection layer from the text encoder to the latent space, script should be adapted a bit if there is.
-    for key, value in list(projection_model_state_dict.items()):
-        new_key = key.replace("seconds_start", "start_number_conditioner").replace(
-            "seconds_total", "end_number_conditioner"
-        )
-        projection_model_state_dict[new_key] = projection_model_state_dict.pop(key)
-
-    model_state_dict = {k.replace("model.model.", ""): v for (k, v) in state_dict.items() if "model.model." in k}
-    for key, value in list(model_state_dict.items()):
-        # attention layers
-        new_key = (
-            key.replace("transformer.", "")
-            .replace("layers", "transformer_blocks")
-            .replace("self_attn", "attn1")
-            .replace("cross_attn", "attn2")
-            .replace("ff.ff", "ff.net")
-        )
-        new_key = (
-            new_key.replace("pre_norm", "norm1")
-            .replace("cross_attend_norm", "norm2")
-            .replace("ff_norm", "norm3")
-            .replace("to_out", "to_out.0")
-        )
-        new_key = new_key.replace("gamma", "weight").replace("beta", "bias")  # replace layernorm
-
-        # other layers
-        new_key = (
-            new_key.replace("project", "proj")
-            .replace("to_timestep_embed", "timestep_proj")
-            .replace("timestep_features", "time_proj")
-            .replace("to_global_embed", "global_proj")
-            .replace("to_cond_embed", "cross_attention_proj")
-        )
-
-        # we're using diffusers implementation of time_proj (GaussianFourierProjection) which creates a 1D tensor
-        if new_key == "time_proj.weight":
-            model_state_dict[key] = model_state_dict[key].squeeze(1)
-
-        if "to_qkv" in new_key:
-            q, k, v = torch.chunk(model_state_dict.pop(key), 3, dim=0)
-            model_state_dict[new_key.replace("qkv", "q")] = q
-            model_state_dict[new_key.replace("qkv", "k")] = k
-            model_state_dict[new_key.replace("qkv", "v")] = v
-        elif "to_kv" in new_key:
-            k, v = torch.chunk(model_state_dict.pop(key), 2, dim=0)
-            model_state_dict[new_key.replace("kv", "k")] = k
-            model_state_dict[new_key.replace("kv", "v")] = v
-        else:
-            model_state_dict[new_key] = model_state_dict.pop(key)
-
-    autoencoder_state_dict = {
-        k.replace("pretransform.model.", "").replace("coder.layers.0", "coder.conv1"): v
-        for (k, v) in state_dict.items()
-        if "pretransform.model." in k
-    }
-
-    for key, _ in list(autoencoder_state_dict.items()):
-        new_key = key
-        if "coder.layers" in new_key:
-            # get idx of the layer
-            idx = int(new_key.split("coder.layers.")[1].split(".")[0])
-
-            new_key = new_key.replace(f"coder.layers.{idx}", f"coder.block.{idx-1}")
-
-            if "encoder" in new_key:
-                for i in range(3):
-                    new_key = new_key.replace(f"block.{idx-1}.layers.{i}", f"block.{idx-1}.res_unit{i+1}")
-                new_key = new_key.replace(f"block.{idx-1}.layers.3", f"block.{idx-1}.snake1")
-                new_key = new_key.replace(f"block.{idx-1}.layers.4", f"block.{idx-1}.conv1")
-            else:
-                for i in range(2, 5):
-                    new_key = new_key.replace(f"block.{idx-1}.layers.{i}", f"block.{idx-1}.res_unit{i-1}")
-                new_key = new_key.replace(f"block.{idx-1}.layers.0", f"block.{idx-1}.snake1")
-                new_key = new_key.replace(f"block.{idx-1}.layers.1", f"block.{idx-1}.conv_t1")
-
-            new_key = new_key.replace("layers.0.beta", "snake1.beta")
-            new_key = new_key.replace("layers.0.alpha", "snake1.alpha")
-            new_key = new_key.replace("layers.2.beta", "snake2.beta")
-            new_key = new_key.replace("layers.2.alpha", "snake2.alpha")
-            new_key = new_key.replace("layers.1.bias", "conv1.bias")
-            new_key = new_key.replace("layers.1.weight_", "conv1.weight_")
-            new_key = new_key.replace("layers.3.bias", "conv2.bias")
-            new_key = new_key.replace("layers.3.weight_", "conv2.weight_")
-
-            if idx == num_autoencoder_layers + 1:
-                new_key = new_key.replace(f"block.{idx-1}", "snake1")
-            elif idx == num_autoencoder_layers + 2:
-                new_key = new_key.replace(f"block.{idx-1}", "conv2")
-
-        else:
-            new_key = new_key
-
-        value = autoencoder_state_dict.pop(key)
-        if "snake" in new_key:
-            value = value.unsqueeze(0).unsqueeze(-1)
-        if new_key in autoencoder_state_dict:
-            raise ValueError(f"{new_key} already in state dict.")
-        autoencoder_state_dict[new_key] = value
-
-    return model_state_dict, projection_model_state_dict, autoencoder_state_dict
-
-
-parser = argparse.ArgumentParser(description="Convert Stable Audio 1.0 model weights to a diffusers pipeline")
-parser.add_argument("--model_folder_path", type=str, help="Location of Stable Audio weights and config")
-parser.add_argument("--use_safetensors", action="store_true", help="Use SafeTensors for conversion")
-parser.add_argument(
-    "--save_directory",
-    type=str,
-    default="./tmp/stable-audio-1.0",
-    help="Directory to save a pipeline to. Will be created if it doesn't exist.",
-)
-parser.add_argument(
-    "--repo_id",
-    type=str,
-    default="stable-audio-1.0",
-    help="Hub organization to save the pipelines to",
-)
-parser.add_argument("--push_to_hub", action="store_true", help="Push to hub")
-parser.add_argument("--variant", type=str, help="Set to bf16 to save bfloat16 weights")
-
-args = parser.parse_args()
-
-checkpoint_path = (
-    os.path.join(args.model_folder_path, "model.safetensors")
-    if args.use_safetensors
-    else os.path.join(args.model_folder_path, "model.ckpt")
-)
-config_path = os.path.join(args.model_folder_path, "model_config.json")
-
-device = "cpu"
-if args.variant == "bf16":
-    dtype = torch.bfloat16
-else:
-    dtype = torch.float32
-
-with open(config_path) as f_in:
-    config_dict = json.load(f_in)
-
-conditioning_dict = {
-    conditioning["id"]: conditioning["config"] for conditioning in config_dict["model"]["conditioning"]["configs"]
-}
-
-t5_model_config = conditioning_dict["prompt"]
-
-# T5 Text encoder
-text_encoder = T5EncoderModel.from_pretrained(t5_model_config["t5_model_name"])
-tokenizer = AutoTokenizer.from_pretrained(
-    t5_model_config["t5_model_name"], truncation=True, model_max_length=t5_model_config["max_length"]
-)
-
-
-# scheduler
-scheduler = CosineDPMSolverMultistepScheduler(
-    sigma_min=0.3,
-    sigma_max=500,
-    solver_order=2,
-    prediction_type="v_prediction",
-    sigma_data=1.0,
-    sigma_schedule="exponential",
-)
-ctx = init_empty_weights if is_accelerate_available() else nullcontext
-
-
-if args.use_safetensors:
-    orig_state_dict = load_file(checkpoint_path, device=device)
-else:
-    orig_state_dict = torch.load(checkpoint_path, map_location=device)
-
-
-model_config = config_dict["model"]["diffusion"]["config"]
-
-model_state_dict, projection_model_state_dict, autoencoder_state_dict = convert_stable_audio_state_dict_to_diffusers(
-    orig_state_dict
-)
-
-
-with ctx():
-    projection_model = StableAudioProjectionModel(
-        text_encoder_dim=text_encoder.config.d_model,
-        conditioning_dim=config_dict["model"]["conditioning"]["cond_dim"],
-        min_value=conditioning_dict["seconds_start"][
-            "min_val"
-        ],  # assume `seconds_start` and `seconds_total` have the same min / max values.
-        max_value=conditioning_dict["seconds_start"][
-            "max_val"
-        ],  # assume `seconds_start` and `seconds_total` have the same min / max values.
-    )
-if is_accelerate_available():
-    load_model_dict_into_meta(projection_model, projection_model_state_dict)
-else:
-    projection_model.load_state_dict(projection_model_state_dict)
-
-attention_head_dim = model_config["embed_dim"] // model_config["num_heads"]
-with ctx():
-    model = StableAudioDiTModel(
-        sample_size=int(config_dict["sample_size"])
-        / int(config_dict["model"]["pretransform"]["config"]["downsampling_ratio"]),
-        in_channels=model_config["io_channels"],
-        num_layers=model_config["depth"],
-        attention_head_dim=attention_head_dim,
-        num_key_value_attention_heads=model_config["cond_token_dim"] // attention_head_dim,
-        num_attention_heads=model_config["num_heads"],
-        out_channels=model_config["io_channels"],
-        cross_attention_dim=model_config["cond_token_dim"],
-        time_proj_dim=256,
-        global_states_input_dim=model_config["global_cond_dim"],
-        cross_attention_input_dim=model_config["cond_token_dim"],
-    )
-if is_accelerate_available():
-    load_model_dict_into_meta(model, model_state_dict)
-else:
-    model.load_state_dict(model_state_dict)
-
-
-autoencoder_config = config_dict["model"]["pretransform"]["config"]
-with ctx():
-    autoencoder = AutoencoderOobleck(
-        encoder_hidden_size=autoencoder_config["encoder"]["config"]["channels"],
-        downsampling_ratios=autoencoder_config["encoder"]["config"]["strides"],
-        decoder_channels=autoencoder_config["decoder"]["config"]["channels"],
-        decoder_input_channels=autoencoder_config["decoder"]["config"]["latent_dim"],
-        audio_channels=autoencoder_config["io_channels"],
-        channel_multiples=autoencoder_config["encoder"]["config"]["c_mults"],
-        sampling_rate=config_dict["sample_rate"],
-    )
-
-if is_accelerate_available():
-    load_model_dict_into_meta(autoencoder, autoencoder_state_dict)
-else:
-    autoencoder.load_state_dict(autoencoder_state_dict)
-
-
-# Prior pipeline
-pipeline = StableAudioPipeline(
-    transformer=model,
-    tokenizer=tokenizer,
-    text_encoder=text_encoder,
-    scheduler=scheduler,
-    vae=autoencoder,
-    projection_model=projection_model,
-)
-pipeline.to(dtype).save_pretrained(
-    args.save_directory, repo_id=args.repo_id, push_to_hub=args.push_to_hub, variant=args.variant
-)
@@ -101,7 +101,7 @@ _deps = [
    "filelock",
    "flax>=0.4.1",
    "hf-doc-builder>=0.3.0",
-    "huggingface-hub>=0.24.5",
+    "huggingface-hub>=0.23.2",
    "requests-mock==1.10.0",
    "importlib_metadata",
    "invisible-watermark>=0.2.0",
@@ -79,7 +79,6 @@ else:
            "AuraFlowTransformer2DModel",
            "AutoencoderKL",
            "AutoencoderKLTemporalDecoder",
-            "AutoencoderOobleck",
            "AutoencoderTiny",
            "ConsistencyDecoderVAE",
            "ControlNetModel",
@@ -100,8 +99,6 @@ else:
            "SD3ControlNetModel",
            "SD3MultiControlNetModel",
            "SD3Transformer2DModel",
-            "SparseControlNetModel",
-            "StableAudioDiTModel",
            "StableCascadeUNet",
            "T2IAdapter",
            "T5FilmDecoder",
@@ -212,7 +209,7 @@ except OptionalDependencyNotAvailable:
    ]

 else:
-    _import_structure["schedulers"].extend(["CosineDPMSolverMultistepScheduler", "DPMSolverSDEScheduler"])
+    _import_structure["schedulers"].extend(["DPMSolverSDEScheduler"])

 try:
    if not (is_torch_available() and is_transformers_available()):
@@ -232,10 +229,8 @@ else:
            "AmusedImg2ImgPipeline",
            "AmusedInpaintPipeline",
            "AmusedPipeline",
-            "AnimateDiffControlNetPipeline",
            "AnimateDiffPipeline",
            "AnimateDiffSDXLPipeline",
-            "AnimateDiffSparseControlNetPipeline",
            "AnimateDiffVideoToVideoPipeline",
            "AudioLDM2Pipeline",
            "AudioLDM2ProjectionModel",
@@ -296,8 +291,6 @@ else:
            "SemanticStableDiffusionPipeline",
            "ShapEImg2ImgPipeline",
            "ShapEPipeline",
-            "StableAudioPipeline",
-            "StableAudioProjectionModel",
            "StableCascadeCombinedPipeline",
            "StableCascadeDecoderPipeline",
            "StableCascadePriorPipeline",
@@ -520,7 +513,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            AuraFlowTransformer2DModel,
            AutoencoderKL,
            AutoencoderKLTemporalDecoder,
-            AutoencoderOobleck,
            AutoencoderTiny,
            ConsistencyDecoderVAE,
            ControlNetModel,
@@ -541,8 +533,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            SD3ControlNetModel,
            SD3MultiControlNetModel,
            SD3Transformer2DModel,
-            SparseControlNetModel,
-            StableAudioDiTModel,
            T2IAdapter,
            T5FilmDecoder,
            Transformer2DModel,
@@ -639,7 +629,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    except OptionalDependencyNotAvailable:
        from .utils.dummy_torch_and_torchsde_objects import *  # noqa F403
    else:
-        from .schedulers import CosineDPMSolverMultistepScheduler, DPMSolverSDEScheduler
+        from .schedulers import DPMSolverSDEScheduler

    try:
        if not (is_torch_available() and is_transformers_available()):
@@ -653,10 +643,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            AmusedImg2ImgPipeline,
            AmusedInpaintPipeline,
            AmusedPipeline,
-            AnimateDiffControlNetPipeline,
            AnimateDiffPipeline,
            AnimateDiffSDXLPipeline,
-            AnimateDiffSparseControlNetPipeline,
            AnimateDiffVideoToVideoPipeline,
            AudioLDM2Pipeline,
            AudioLDM2ProjectionModel,
@@ -715,8 +703,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            SemanticStableDiffusionPipeline,
            ShapEImg2ImgPipeline,
            ShapEPipeline,
-            StableAudioPipeline,
-            StableAudioProjectionModel,
            StableCascadeCombinedPipeline,
            StableCascadeDecoderPipeline,
            StableCascadePriorPipeline,
@@ -9,7 +9,7 @@ deps = {
    "filelock": "filelock",
    "flax": "flax>=0.4.1",
    "hf-doc-builder": "hf-doc-builder>=0.3.0",
-    "huggingface-hub": "huggingface-hub>=0.24.5",
+    "huggingface-hub": "huggingface-hub>=0.23.2",
    "requests-mock": "requests-mock==1.10.0",
    "importlib_metadata": "importlib_metadata",
    "invisible-watermark": "invisible-watermark>=0.2.0",
@@ -55,6 +55,7 @@ _import_structure = {}

 if is_torch_available():
    _import_structure["single_file_model"] = ["FromOriginalModelMixin"]
+    _import_structure["transformer_sd3"] = ["SD3TransformerLoRALoadersMixin"]

    _import_structure["unet"] = ["UNet2DConditionLoadersMixin"]
    _import_structure["utils"] = ["AttnProcsLayers"]
@@ -65,7 +66,6 @@ if is_torch_available():
            "StableDiffusionLoraLoaderMixin",
            "SD3LoraLoaderMixin",
            "StableDiffusionXLLoraLoaderMixin",
-            "LoraLoaderMixin",
        ]
        _import_structure["textual_inversion"] = ["TextualInversionLoaderMixin"]
        _import_structure["ip_adapter"] = ["IPAdapterMixin"]
@@ -83,7 +83,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            from .ip_adapter import IPAdapterMixin
            from .lora_pipeline import (
                AmusedLoraLoaderMixin,
-                LoraLoaderMixin,
                SD3LoraLoaderMixin,
                StableDiffusionLoraLoaderMixin,
                StableDiffusionXLLoraLoaderMixin,
@@ -222,8 +222,7 @@ class IPAdapterMixin:

            # create feature extractor if it has not been registered to the pipeline yet
            if hasattr(self, "feature_extractor") and getattr(self, "feature_extractor", None) is None:
-                clip_image_size = self.image_encoder.config.image_size
-                feature_extractor = CLIPImageProcessor(size=clip_image_size, crop_size=clip_image_size)
+                feature_extractor = CLIPImageProcessor()
                self.register_modules(feature_extractor=feature_extractor)

        # load ip-adapter into unet
@@ -320,13 +319,7 @@ class IPAdapterMixin:

        # remove hidden encoder
        self.unet.encoder_hid_proj = None
-        self.unet.config.encoder_hid_dim_type = None
-
-        # Kolors: restore `encoder_hid_proj` with `text_encoder_hid_proj`
-        if hasattr(self.unet, "text_encoder_hid_proj") and self.unet.text_encoder_hid_proj is not None:
-            self.unet.encoder_hid_proj = self.unet.text_encoder_hid_proj
-            self.unet.text_encoder_hid_proj = None
-            self.unet.config.encoder_hid_dim_type = "text_proj"
+        self.config.encoder_hid_dim_type = None

        # restore original Unet attention processors layers
        attn_procs = {}
@@ -30,7 +30,6 @@ from .unet_loader_utils import _maybe_expand_lora_scales

 _SET_ADAPTER_SCALE_FN_MAPPING = {
    "UNet2DConditionModel": _maybe_expand_lora_scales,
-    "UNetMotionModel": _maybe_expand_lora_scales,
    "SD3Transformer2DModel": lambda model_cls, weights: weights,
 }

@@ -823,15 +823,6 @@ class UNet2DConditionLoadersMixin:
    def _load_ip_adapter_weights(self, state_dicts, low_cpu_mem_usage=False):
        if not isinstance(state_dicts, list):
            state_dicts = [state_dicts]
-
-        # Kolors Unet already has a `encoder_hid_proj`
-        if (
-            self.encoder_hid_proj is not None
-            and self.config.encoder_hid_dim_type == "text_proj"
-            and not hasattr(self, "text_encoder_hid_proj")
-        ):
-            self.text_encoder_hid_proj = self.encoder_hid_proj
-
        # Set encoder_hid_proj after loading ip_adapter weights,
        # because `IPAdapterPlusImageProjection` also has `attn_processors`.
        self.encoder_hid_proj = None
@@ -29,14 +29,12 @@ if is_torch_available():
    _import_structure["autoencoders.autoencoder_asym_kl"] = ["AsymmetricAutoencoderKL"]
    _import_structure["autoencoders.autoencoder_kl"] = ["AutoencoderKL"]
    _import_structure["autoencoders.autoencoder_kl_temporal_decoder"] = ["AutoencoderKLTemporalDecoder"]
-    _import_structure["autoencoders.autoencoder_oobleck"] = ["AutoencoderOobleck"]
    _import_structure["autoencoders.autoencoder_tiny"] = ["AutoencoderTiny"]
    _import_structure["autoencoders.consistency_decoder_vae"] = ["ConsistencyDecoderVAE"]
    _import_structure["autoencoders.vq_model"] = ["VQModel"]
    _import_structure["controlnet"] = ["ControlNetModel"]
    _import_structure["controlnet_hunyuan"] = ["HunyuanDiT2DControlNetModel", "HunyuanDiT2DMultiControlNetModel"]
    _import_structure["controlnet_sd3"] = ["SD3ControlNetModel", "SD3MultiControlNetModel"]
-    _import_structure["controlnet_sparsectrl"] = ["SparseControlNetModel"]
    _import_structure["controlnet_xs"] = ["ControlNetXSAdapter", "UNetControlNetXSModel"]
    _import_structure["embeddings"] = ["ImageProjection"]
    _import_structure["modeling_utils"] = ["ModelMixin"]
@@ -48,7 +46,6 @@ if is_torch_available():
    _import_structure["transformers.lumina_nextdit2d"] = ["LuminaNextDiT2DModel"]
    _import_structure["transformers.pixart_transformer_2d"] = ["PixArtTransformer2DModel"]
    _import_structure["transformers.prior_transformer"] = ["PriorTransformer"]
-    _import_structure["transformers.stable_audio_transformer"] = ["StableAudioDiTModel"]
    _import_structure["transformers.t5_film_transformer"] = ["T5FilmDecoder"]
    _import_structure["transformers.transformer_2d"] = ["Transformer2DModel"]
    _import_structure["transformers.transformer_sd3"] = ["SD3Transformer2DModel"]
@@ -77,7 +74,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            AsymmetricAutoencoderKL,
            AutoencoderKL,
            AutoencoderKLTemporalDecoder,
-            AutoencoderOobleck,
            AutoencoderTiny,
            ConsistencyDecoderVAE,
            VQModel,
@@ -85,7 +81,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .controlnet import ControlNetModel
        from .controlnet_hunyuan import HunyuanDiT2DControlNetModel, HunyuanDiT2DMultiControlNetModel
        from .controlnet_sd3 import SD3ControlNetModel, SD3MultiControlNetModel
-        from .controlnet_sparsectrl import SparseControlNetModel
        from .controlnet_xs import ControlNetXSAdapter, UNetControlNetXSModel
        from .embeddings import ImageProjection
        from .modeling_utils import ModelMixin
@@ -99,7 +94,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            PixArtTransformer2DModel,
            PriorTransformer,
            SD3Transformer2DModel,
-            StableAudioDiTModel,
            T5FilmDecoder,
            Transformer2DModel,
            TransformerTemporalModel,
@@ -123,28 +123,6 @@ class GEGLU(nn.Module):
            return hidden_states * self.gelu(gate)


-class SwiGLU(nn.Module):
-    r"""
-    A [variant](https://arxiv.org/abs/2002.05202) of the gated linear unit activation function. It's similar to `GEGLU`
-    but uses SiLU / Swish instead of GeLU.
-
-    Parameters:
-        dim_in (`int`): The number of channels in the input.
-        dim_out (`int`): The number of channels in the output.
-        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
-    """
-
-    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
-        super().__init__()
-        self.proj = nn.Linear(dim_in, dim_out * 2, bias=bias)
-        self.activation = nn.SiLU()
-
-    def forward(self, hidden_states):
-        hidden_states = self.proj(hidden_states)
-        hidden_states, gate = hidden_states.chunk(2, dim=-1)
-        return hidden_states * self.activation(gate)
-
-
 class ApproximateGELU(nn.Module):
    r"""
    The approximate form of the Gaussian Error Linear Unit (GELU). For more details, see section 2 of this
@@ -19,7 +19,7 @@ from torch import nn

 from ..utils import deprecate, logging
 from ..utils.torch_utils import maybe_allow_in_graph
-from .activations import GEGLU, GELU, ApproximateGELU, FP32SiLU, SwiGLU
+from .activations import GEGLU, GELU, ApproximateGELU, FP32SiLU
 from .attention_processor import Attention, JointAttnProcessor2_0
 from .embeddings import SinusoidalPositionalEmbedding
 from .normalization import AdaLayerNorm, AdaLayerNormContinuous, AdaLayerNormZero, RMSNorm
@@ -820,8 +820,6 @@ class FeedForward(nn.Module):
            act_fn = GEGLU(dim, inner_dim, bias=bias)
        elif activation_fn == "geglu-approximate":
            act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
-        elif activation_fn == "swiglu":
-            act_fn = SwiGLU(dim, inner_dim, bias=bias)

        self.net = nn.ModuleList([])
        # project in
@@ -13,7 +13,7 @@
 # limitations under the License.
 import inspect
 import math
-from typing import Callable, List, Optional, Tuple, Union
+from typing import Callable, List, Optional, Union

 import torch
 import torch.nn.functional as F
@@ -49,10 +49,6 @@ class Attention(nn.Module):
            The number of channels in the encoder_hidden_states. If not given, defaults to `query_dim`.
        heads (`int`,  *optional*, defaults to 8):
            The number of heads to use for multi-head attention.
-        kv_heads (`int`,  *optional*, defaults to `None`):
-            The number of key and value heads to use for multi-head attention. Defaults to `heads`. If
-            `kv_heads=heads`, the model will use Multi Head Attention (MHA), if `kv_heads=1` the model will use Multi
-            Query Attention (MQA) otherwise GQA is used.
        dim_head (`int`,  *optional*, defaults to 64):
            The number of channels in each head.
        dropout (`float`, *optional*, defaults to 0.0):
@@ -1628,137 +1624,6 @@ class AttnProcessor2_0:
        return hidden_states


-class StableAudioAttnProcessor2_0:
-    r"""
-    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
-    used in the Stable Audio model. It applies rotary embedding on query and key vector, and allows MHA, GQA or MQA.
-    """
-
-    def __init__(self):
-        if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError(
-                "StableAudioAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
-            )
-
-    def apply_partial_rotary_emb(
-        self,
-        x: torch.Tensor,
-        freqs_cis: Tuple[torch.Tensor],
-    ) -> torch.Tensor:
-        from .embeddings import apply_rotary_emb
-
-        rot_dim = freqs_cis[0].shape[-1]
-        x_to_rotate, x_unrotated = x[..., :rot_dim], x[..., rot_dim:]
-
-        x_rotated = apply_rotary_emb(x_to_rotate, freqs_cis, use_real=True, use_real_unbind_dim=-2)
-
-        out = torch.cat((x_rotated, x_unrotated), dim=-1)
-        return out
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        rotary_emb: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        from .embeddings import apply_rotary_emb
-
-        residual = hidden_states
-
-        input_ndim = hidden_states.ndim
-
-        if input_ndim == 4:
-            batch_size, channel, height, width = hidden_states.shape
-            hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
-        batch_size, sequence_length, _ = (
-            hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-        )
-
-        if attention_mask is not None:
-            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-            # scaled_dot_product_attention expects attention_mask shape to be
-            # (batch, heads, source_length, target_length)
-            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
-
-        query = attn.to_q(hidden_states)
-
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-        elif attn.norm_cross:
-            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-
-        head_dim = query.shape[-1] // attn.heads
-        kv_heads = key.shape[-1] // head_dim
-
-        query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
-        key = key.view(batch_size, -1, kv_heads, head_dim).transpose(1, 2)
-        value = value.view(batch_size, -1, kv_heads, head_dim).transpose(1, 2)
-
-        if kv_heads != attn.heads:
-            # if GQA or MQA, repeat the key/value heads to reach the number of query heads.
-            heads_per_kv_head = attn.heads // kv_heads
-            key = torch.repeat_interleave(key, heads_per_kv_head, dim=1)
-            value = torch.repeat_interleave(value, heads_per_kv_head, dim=1)
-
-        if attn.norm_q is not None:
-            query = attn.norm_q(query)
-        if attn.norm_k is not None:
-            key = attn.norm_k(key)
-
-        # Apply RoPE if needed
-        if rotary_emb is not None:
-            query_dtype = query.dtype
-            key_dtype = key.dtype
-            query = query.to(torch.float32)
-            key = key.to(torch.float32)
-
-            rot_dim = rotary_emb[0].shape[-1]
-            query_to_rotate, query_unrotated = query[..., :rot_dim], query[..., rot_dim:]
-            query_rotated = apply_rotary_emb(query_to_rotate, rotary_emb, use_real=True, use_real_unbind_dim=-2)
-
-            query = torch.cat((query_rotated, query_unrotated), dim=-1)
-
-            if not attn.is_cross_attention:
-                key_to_rotate, key_unrotated = key[..., :rot_dim], key[..., rot_dim:]
-                key_rotated = apply_rotary_emb(key_to_rotate, rotary_emb, use_real=True, use_real_unbind_dim=-2)
-
-                key = torch.cat((key_rotated, key_unrotated), dim=-1)
-
-            query = query.to(query_dtype)
-            key = key.to(key_dtype)
-
-        # the output of sdp = (batch, num_heads, seq_len, head_dim)
-        # TODO: add support for attn.scale when we move to Torch 2.1
-        hidden_states = F.scaled_dot_product_attention(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
-        )
-
-        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
-        hidden_states = hidden_states.to(query.dtype)
-
-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
-
-        if input_ndim == 4:
-            hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-        if attn.residual_connection:
-            hidden_states = hidden_states + residual
-
-        hidden_states = hidden_states / attn.rescale_output_factor
-
-        return hidden_states
-
-
 class HunyuanAttnProcessor2_0:
    r"""
    Processor for implementing scaled dot-product attention (enabled by default if you're using PyTorch 2.0). This is
@@ -3097,6 +2962,12 @@ class PAGIdentitySelfAttnProcessor2_0:
        # perturbed path (identity attention)
        batch_size, sequence_length, _ = hidden_states_ptb.shape

+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
        if attn.group_norm is not None:
            hidden_states_ptb = attn.group_norm(hidden_states_ptb.transpose(1, 2)).transpose(1, 2)

@@ -3199,6 +3070,12 @@ class PAGCFGIdentitySelfAttnProcessor2_0:
        # perturbed path (identity attention)
        batch_size, sequence_length, _ = hidden_states_ptb.shape

+        if attention_mask is not None:
+            attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
+            # scaled_dot_product_attention expects attention_mask shape to be
+            # (batch, heads, source_length, target_length)
+            attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
+
        if attn.group_norm is not None:
            hidden_states_ptb = attn.group_norm(hidden_states_ptb.transpose(1, 2)).transpose(1, 2)

@@ -1,7 +1,6 @@
 from .autoencoder_asym_kl import AsymmetricAutoencoderKL
 from .autoencoder_kl import AutoencoderKL
 from .autoencoder_kl_temporal_decoder import AutoencoderKLTemporalDecoder
-from .autoencoder_oobleck import AutoencoderOobleck
 from .autoencoder_tiny import AutoencoderTiny
 from .consistency_decoder_vae import ConsistencyDecoderVAE
 from .vq_model import VQModel
@@ -1,464 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import math
-from dataclasses import dataclass
-from typing import Optional, Tuple, Union
-
-import numpy as np
-import torch
-import torch.nn as nn
-from torch.nn.utils import weight_norm
-
-from ...configuration_utils import ConfigMixin, register_to_config
-from ...utils import BaseOutput
-from ...utils.accelerate_utils import apply_forward_hook
-from ...utils.torch_utils import randn_tensor
-from ..modeling_utils import ModelMixin
-
-
-class Snake1d(nn.Module):
-    """
-    A 1-dimensional Snake activation function module.
-    """
-
-    def __init__(self, hidden_dim, logscale=True):
-        super().__init__()
-        self.alpha = nn.Parameter(torch.zeros(1, hidden_dim, 1))
-        self.beta = nn.Parameter(torch.zeros(1, hidden_dim, 1))
-
-        self.alpha.requires_grad = True
-        self.beta.requires_grad = True
-        self.logscale = logscale
-
-    def forward(self, hidden_states):
-        shape = hidden_states.shape
-
-        alpha = self.alpha if not self.logscale else torch.exp(self.alpha)
-        beta = self.beta if not self.logscale else torch.exp(self.beta)
-
-        hidden_states = hidden_states.reshape(shape[0], shape[1], -1)
-        hidden_states = hidden_states + (beta + 1e-9).reciprocal() * torch.sin(alpha * hidden_states).pow(2)
-        hidden_states = hidden_states.reshape(shape)
-        return hidden_states
-
-
-class OobleckResidualUnit(nn.Module):
-    """
-    A residual unit composed of Snake1d and weight-normalized Conv1d layers with dilations.
-    """
-
-    def __init__(self, dimension: int = 16, dilation: int = 1):
-        super().__init__()
-        pad = ((7 - 1) * dilation) // 2
-
-        self.snake1 = Snake1d(dimension)
-        self.conv1 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=7, dilation=dilation, padding=pad))
-        self.snake2 = Snake1d(dimension)
-        self.conv2 = weight_norm(nn.Conv1d(dimension, dimension, kernel_size=1))
-
-    def forward(self, hidden_state):
-        """
-        Forward pass through the residual unit.
-
-        Args:
-            hidden_state (`torch.Tensor` of shape `(batch_size, channels, time_steps)`):
-                Input tensor .
-
-        Returns:
-            output_tensor (`torch.Tensor` of shape `(batch_size, channels, time_steps)`)
-                Input tensor after passing through the residual unit.
-        """
-        output_tensor = hidden_state
-        output_tensor = self.conv1(self.snake1(output_tensor))
-        output_tensor = self.conv2(self.snake2(output_tensor))
-
-        padding = (hidden_state.shape[-1] - output_tensor.shape[-1]) // 2
-        if padding > 0:
-            hidden_state = hidden_state[..., padding:-padding]
-        output_tensor = hidden_state + output_tensor
-        return output_tensor
-
-
-class OobleckEncoderBlock(nn.Module):
-    """Encoder block used in Oobleck encoder."""
-
-    def __init__(self, input_dim, output_dim, stride: int = 1):
-        super().__init__()
-
-        self.res_unit1 = OobleckResidualUnit(input_dim, dilation=1)
-        self.res_unit2 = OobleckResidualUnit(input_dim, dilation=3)
-        self.res_unit3 = OobleckResidualUnit(input_dim, dilation=9)
-        self.snake1 = Snake1d(input_dim)
-        self.conv1 = weight_norm(
-            nn.Conv1d(input_dim, output_dim, kernel_size=2 * stride, stride=stride, padding=math.ceil(stride / 2))
-        )
-
-    def forward(self, hidden_state):
-        hidden_state = self.res_unit1(hidden_state)
-        hidden_state = self.res_unit2(hidden_state)
-        hidden_state = self.snake1(self.res_unit3(hidden_state))
-        hidden_state = self.conv1(hidden_state)
-
-        return hidden_state
-
-
-class OobleckDecoderBlock(nn.Module):
-    """Decoder block used in Oobleck decoder."""
-
-    def __init__(self, input_dim, output_dim, stride: int = 1):
-        super().__init__()
-
-        self.snake1 = Snake1d(input_dim)
-        self.conv_t1 = weight_norm(
-            nn.ConvTranspose1d(
-                input_dim,
-                output_dim,
-                kernel_size=2 * stride,
-                stride=stride,
-                padding=math.ceil(stride / 2),
-            )
-        )
-        self.res_unit1 = OobleckResidualUnit(output_dim, dilation=1)
-        self.res_unit2 = OobleckResidualUnit(output_dim, dilation=3)
-        self.res_unit3 = OobleckResidualUnit(output_dim, dilation=9)
-
-    def forward(self, hidden_state):
-        hidden_state = self.snake1(hidden_state)
-        hidden_state = self.conv_t1(hidden_state)
-        hidden_state = self.res_unit1(hidden_state)
-        hidden_state = self.res_unit2(hidden_state)
-        hidden_state = self.res_unit3(hidden_state)
-
-        return hidden_state
-
-
-class OobleckDiagonalGaussianDistribution(object):
-    def __init__(self, parameters: torch.Tensor, deterministic: bool = False):
-        self.parameters = parameters
-        self.mean, self.scale = parameters.chunk(2, dim=1)
-        self.std = nn.functional.softplus(self.scale) + 1e-4
-        self.var = self.std * self.std
-        self.logvar = torch.log(self.var)
-        self.deterministic = deterministic
-
-    def sample(self, generator: Optional[torch.Generator] = None) -> torch.Tensor:
-        # make sure sample is on the same device as the parameters and has same dtype
-        sample = randn_tensor(
-            self.mean.shape,
-            generator=generator,
-            device=self.parameters.device,
-            dtype=self.parameters.dtype,
-        )
-        x = self.mean + self.std * sample
-        return x
-
-    def kl(self, other: "OobleckDiagonalGaussianDistribution" = None) -> torch.Tensor:
-        if self.deterministic:
-            return torch.Tensor([0.0])
-        else:
-            if other is None:
-                return (self.mean * self.mean + self.var - self.logvar - 1.0).sum(1).mean()
-            else:
-                normalized_diff = torch.pow(self.mean - other.mean, 2) / other.var
-                var_ratio = self.var / other.var
-                logvar_diff = self.logvar - other.logvar
-
-                kl = normalized_diff + var_ratio + logvar_diff - 1
-
-                kl = kl.sum(1).mean()
-                return kl
-
-    def mode(self) -> torch.Tensor:
-        return self.mean
-
-
-@dataclass
-class AutoencoderOobleckOutput(BaseOutput):
-    """
-    Output of AutoencoderOobleck encoding method.
-
-    Args:
-        latent_dist (`OobleckDiagonalGaussianDistribution`):
-            Encoded outputs of `Encoder` represented as the mean and standard deviation of
-            `OobleckDiagonalGaussianDistribution`. `OobleckDiagonalGaussianDistribution` allows for sampling latents
-            from the distribution.
-    """
-
-    latent_dist: "OobleckDiagonalGaussianDistribution"  # noqa: F821
-
-
-@dataclass
-class OobleckDecoderOutput(BaseOutput):
-    r"""
-    Output of decoding method.
-
-    Args:
-        sample (`torch.Tensor` of shape `(batch_size, audio_channels, sequence_length)`):
-            The decoded output sample from the last layer of the model.
-    """
-
-    sample: torch.Tensor
-
-
-class OobleckEncoder(nn.Module):
-    """Oobleck Encoder"""
-
-    def __init__(self, encoder_hidden_size, audio_channels, downsampling_ratios, channel_multiples):
-        super().__init__()
-
-        strides = downsampling_ratios
-        channel_multiples = [1] + channel_multiples
-
-        # Create first convolution
-        self.conv1 = weight_norm(nn.Conv1d(audio_channels, encoder_hidden_size, kernel_size=7, padding=3))
-
-        self.block = []
-        # Create EncoderBlocks that double channels as they downsample by `stride`
-        for stride_index, stride in enumerate(strides):
-            self.block += [
-                OobleckEncoderBlock(
-                    input_dim=encoder_hidden_size * channel_multiples[stride_index],
-                    output_dim=encoder_hidden_size * channel_multiples[stride_index + 1],
-                    stride=stride,
-                )
-            ]
-
-        self.block = nn.ModuleList(self.block)
-        d_model = encoder_hidden_size * channel_multiples[-1]
-        self.snake1 = Snake1d(d_model)
-        self.conv2 = weight_norm(nn.Conv1d(d_model, encoder_hidden_size, kernel_size=3, padding=1))
-
-    def forward(self, hidden_state):
-        hidden_state = self.conv1(hidden_state)
-
-        for module in self.block:
-            hidden_state = module(hidden_state)
-
-        hidden_state = self.snake1(hidden_state)
-        hidden_state = self.conv2(hidden_state)
-
-        return hidden_state
-
-
-class OobleckDecoder(nn.Module):
-    """Oobleck Decoder"""
-
-    def __init__(self, channels, input_channels, audio_channels, upsampling_ratios, channel_multiples):
-        super().__init__()
-
-        strides = upsampling_ratios
-        channel_multiples = [1] + channel_multiples
-
-        # Add first conv layer
-        self.conv1 = weight_norm(nn.Conv1d(input_channels, channels * channel_multiples[-1], kernel_size=7, padding=3))
-
-        # Add upsampling + MRF blocks
-        block = []
-        for stride_index, stride in enumerate(strides):
-            block += [
-                OobleckDecoderBlock(
-                    input_dim=channels * channel_multiples[len(strides) - stride_index],
-                    output_dim=channels * channel_multiples[len(strides) - stride_index - 1],
-                    stride=stride,
-                )
-            ]
-
-        self.block = nn.ModuleList(block)
-        output_dim = channels
-        self.snake1 = Snake1d(output_dim)
-        self.conv2 = weight_norm(nn.Conv1d(channels, audio_channels, kernel_size=7, padding=3, bias=False))
-
-    def forward(self, hidden_state):
-        hidden_state = self.conv1(hidden_state)
-
-        for layer in self.block:
-            hidden_state = layer(hidden_state)
-
-        hidden_state = self.snake1(hidden_state)
-        hidden_state = self.conv2(hidden_state)
-
-        return hidden_state
-
-
-class AutoencoderOobleck(ModelMixin, ConfigMixin):
-    r"""
-    An autoencoder for encoding waveforms into latents and decoding latent representations into waveforms. First
-    introduced in Stable Audio.
-
-    This model inherits from [`ModelMixin`]. Check the superclass documentation for it's generic methods implemented
-    for all models (such as downloading or saving).
-
-    Parameters:
-        encoder_hidden_size (`int`, *optional*, defaults to 128):
-            Intermediate representation dimension for the encoder.
-        downsampling_ratios (`List[int]`, *optional*, defaults to `[2, 4, 4, 8, 8]`):
-            Ratios for downsampling in the encoder. These are used in reverse order for upsampling in the decoder.
-        channel_multiples (`List[int]`, *optional*, defaults to `[1, 2, 4, 8, 16]`):
-            Multiples used to determine the hidden sizes of the hidden layers.
-        decoder_channels (`int`, *optional*, defaults to 128):
-            Intermediate representation dimension for the decoder.
-        decoder_input_channels (`int`, *optional*, defaults to 64):
-            Input dimension for the decoder. Corresponds to the latent dimension.
-        audio_channels (`int`, *optional*, defaults to 2):
-            Number of channels in the audio data. Either 1 for mono or 2 for stereo.
-        sampling_rate (`int`, *optional*, defaults to 44100):
-            The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
-    """
-
-    _supports_gradient_checkpointing = False
-
-    @register_to_config
-    def __init__(
-        self,
-        encoder_hidden_size=128,
-        downsampling_ratios=[2, 4, 4, 8, 8],
-        channel_multiples=[1, 2, 4, 8, 16],
-        decoder_channels=128,
-        decoder_input_channels=64,
-        audio_channels=2,
-        sampling_rate=44100,
-    ):
-        super().__init__()
-
-        self.encoder_hidden_size = encoder_hidden_size
-        self.downsampling_ratios = downsampling_ratios
-        self.decoder_channels = decoder_channels
-        self.upsampling_ratios = downsampling_ratios[::-1]
-        self.hop_length = int(np.prod(downsampling_ratios))
-        self.sampling_rate = sampling_rate
-
-        self.encoder = OobleckEncoder(
-            encoder_hidden_size=encoder_hidden_size,
-            audio_channels=audio_channels,
-            downsampling_ratios=downsampling_ratios,
-            channel_multiples=channel_multiples,
-        )
-
-        self.decoder = OobleckDecoder(
-            channels=decoder_channels,
-            input_channels=decoder_input_channels,
-            audio_channels=audio_channels,
-            upsampling_ratios=self.upsampling_ratios,
-            channel_multiples=channel_multiples,
-        )
-
-        self.use_slicing = False
-
-    def enable_slicing(self):
-        r"""
-        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
-        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
-        """
-        self.use_slicing = True
-
-    def disable_slicing(self):
-        r"""
-        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
-        decoding in one step.
-        """
-        self.use_slicing = False
-
-    @apply_forward_hook
-    def encode(
-        self, x: torch.Tensor, return_dict: bool = True
-    ) -> Union[AutoencoderOobleckOutput, Tuple[OobleckDiagonalGaussianDistribution]]:
-        """
-        Encode a batch of images into latents.
-
-        Args:
-            x (`torch.Tensor`): Input batch of images.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
-
-        Returns:
-                The latent representations of the encoded images. If `return_dict` is True, a
-                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
-        """
-        if self.use_slicing and x.shape[0] > 1:
-            encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
-            h = torch.cat(encoded_slices)
-        else:
-            h = self.encoder(x)
-
-        posterior = OobleckDiagonalGaussianDistribution(h)
-
-        if not return_dict:
-            return (posterior,)
-
-        return AutoencoderOobleckOutput(latent_dist=posterior)
-
-    def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[OobleckDecoderOutput, torch.Tensor]:
-        dec = self.decoder(z)
-
-        if not return_dict:
-            return (dec,)
-
-        return OobleckDecoderOutput(sample=dec)
-
-    @apply_forward_hook
-    def decode(
-        self, z: torch.FloatTensor, return_dict: bool = True, generator=None
-    ) -> Union[OobleckDecoderOutput, torch.FloatTensor]:
-        """
-        Decode a batch of images.
-
-        Args:
-            z (`torch.Tensor`): Input batch of latent vectors.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether to return a [`~models.vae.OobleckDecoderOutput`] instead of a plain tuple.
-
-        Returns:
-            [`~models.vae.OobleckDecoderOutput`] or `tuple`:
-                If return_dict is True, a [`~models.vae.OobleckDecoderOutput`] is returned, otherwise a plain `tuple`
-                is returned.
-
-        """
-        if self.use_slicing and z.shape[0] > 1:
-            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
-            decoded = torch.cat(decoded_slices)
-        else:
-            decoded = self._decode(z).sample
-
-        if not return_dict:
-            return (decoded,)
-
-        return OobleckDecoderOutput(sample=decoded)
-
-    def forward(
-        self,
-        sample: torch.Tensor,
-        sample_posterior: bool = False,
-        return_dict: bool = True,
-        generator: Optional[torch.Generator] = None,
-    ) -> Union[OobleckDecoderOutput, torch.Tensor]:
-        r"""
-        Args:
-            sample (`torch.Tensor`): Input sample.
-            sample_posterior (`bool`, *optional*, defaults to `False`):
-                Whether to sample from the posterior.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`OobleckDecoderOutput`] instead of a plain tuple.
-        """
-        x = sample
-        posterior = self.encode(x).latent_dist
-        if sample_posterior:
-            z = posterior.sample(generator=generator)
-        else:
-            z = posterior.mode()
-        dec = self.decode(z).sample
-
-        if not return_dict:
-            return (dec,)
-
-        return OobleckDecoderOutput(sample=dec)
@@ -830,6 +830,7 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
                sample = self.mid_block(sample, emb)

        # 5. Control net blocks
+
        controlnet_down_block_res_samples = ()

        for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
@@ -1,791 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Tuple, Union
-
-import torch
-from torch import nn
-from torch.nn import functional as F
-
-from ..configuration_utils import ConfigMixin, register_to_config
-from ..utils import BaseOutput, logging
-from .attention_processor import (
-    ADDED_KV_ATTENTION_PROCESSORS,
-    CROSS_ATTENTION_PROCESSORS,
-    AttentionProcessor,
-    AttnAddedKVProcessor,
-    AttnProcessor,
-)
-from .embeddings import TimestepEmbedding, Timesteps
-from .modeling_utils import ModelMixin
-from .unets.unet_2d_blocks import UNetMidBlock2DCrossAttn
-from .unets.unet_2d_condition import UNet2DConditionModel
-from .unets.unet_3d_blocks import (
-    CrossAttnDownBlockMotion,
-    DownBlockMotion,
-)
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-@dataclass
-class SparseControlNetOutput(BaseOutput):
-    """
-    The output of [`SparseControlNetModel`].
-
-    Args:
-        down_block_res_samples (`tuple[torch.Tensor]`):
-            A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should
-            be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be
-            used to condition the original UNet's downsampling activations.
-        mid_down_block_re_sample (`torch.Tensor`):
-            The activation of the middle block (the lowest sample resolution). Each tensor should be of shape
-            `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`.
-            Output can be used to condition the original UNet's middle block activation.
-    """
-
-    down_block_res_samples: Tuple[torch.Tensor]
-    mid_block_res_sample: torch.Tensor
-
-
-class SparseControlNetConditioningEmbedding(nn.Module):
-    def __init__(
-        self,
-        conditioning_embedding_channels: int,
-        conditioning_channels: int = 3,
-        block_out_channels: Tuple[int, ...] = (16, 32, 96, 256),
-    ):
-        super().__init__()
-
-        self.conv_in = nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
-        self.blocks = nn.ModuleList([])
-
-        for i in range(len(block_out_channels) - 1):
-            channel_in = block_out_channels[i]
-            channel_out = block_out_channels[i + 1]
-            self.blocks.append(nn.Conv2d(channel_in, channel_in, kernel_size=3, padding=1))
-            self.blocks.append(nn.Conv2d(channel_in, channel_out, kernel_size=3, padding=1, stride=2))
-
-        self.conv_out = zero_module(
-            nn.Conv2d(block_out_channels[-1], conditioning_embedding_channels, kernel_size=3, padding=1)
-        )
-
-    def forward(self, conditioning: torch.Tensor) -> torch.Tensor:
-        embedding = self.conv_in(conditioning)
-        embedding = F.silu(embedding)
-
-        for block in self.blocks:
-            embedding = block(embedding)
-            embedding = F.silu(embedding)
-
-        embedding = self.conv_out(embedding)
-        return embedding
-
-
-class SparseControlNetModel(ModelMixin, ConfigMixin):
-    """
-    A SparseControlNet model as described in [SparseCtrl: Adding Sparse Controls to Text-to-Video Diffusion
-    Models](https://arxiv.org/abs/2311.16933).
-
-    Args:
-        in_channels (`int`, defaults to 4):
-            The number of channels in the input sample.
-        conditioning_channels (`int`, defaults to 4):
-            The number of input channels in the controlnet conditional embedding module. If
-            `concat_condition_embedding` is True, the value provided here is incremented by 1.
-        flip_sin_to_cos (`bool`, defaults to `True`):
-            Whether to flip the sin to cos in the time embedding.
-        freq_shift (`int`, defaults to 0):
-            The frequency shift to apply to the time embedding.
-        down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
-            The tuple of downsample blocks to use.
-        only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`):
-        block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`):
-            The tuple of output channels for each block.
-        layers_per_block (`int`, defaults to 2):
-            The number of layers per block.
-        downsample_padding (`int`, defaults to 1):
-            The padding to use for the downsampling convolution.
-        mid_block_scale_factor (`float`, defaults to 1):
-            The scale factor to use for the mid block.
-        act_fn (`str`, defaults to "silu"):
-            The activation function to use.
-        norm_num_groups (`int`, *optional*, defaults to 32):
-            The number of groups to use for the normalization. If None, normalization and activation layers is skipped
-            in post-processing.
-        norm_eps (`float`, defaults to 1e-5):
-            The epsilon to use for the normalization.
-        cross_attention_dim (`int`, defaults to 1280):
-            The dimension of the cross attention features.
-        transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
-            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
-            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
-            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
-        transformer_layers_per_mid_block (`int` or `Tuple[int]`, *optional*, defaults to 1):
-            The number of transformer layers to use in each layer in the middle block.
-        attention_head_dim (`int` or `Tuple[int]`, defaults to 8):
-            The dimension of the attention heads.
-        num_attention_heads (`int` or `Tuple[int]`, *optional*):
-            The number of heads to use for multi-head attention.
-        use_linear_projection (`bool`, defaults to `False`):
-        upcast_attention (`bool`, defaults to `False`):
-        resnet_time_scale_shift (`str`, defaults to `"default"`):
-            Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`.
-        conditioning_embedding_out_channels (`Tuple[int]`, defaults to `(16, 32, 96, 256)`):
-            The tuple of output channel for each block in the `conditioning_embedding` layer.
-        global_pool_conditions (`bool`, defaults to `False`):
-            TODO(Patrick) - unused parameter
-        controlnet_conditioning_channel_order (`str`, defaults to `rgb`):
-        motion_max_seq_length (`int`, defaults to `32`):
-            The maximum sequence length to use in the motion module.
-        motion_num_attention_heads (`int` or `Tuple[int]`, defaults to `8`):
-            The number of heads to use in each attention layer of the motion module.
-        concat_conditioning_mask (`bool`, defaults to `True`):
-        use_simplified_condition_embedding (`bool`, defaults to `True`):
-    """
-
-    _supports_gradient_checkpointing = True
-
-    @register_to_config
-    def __init__(
-        self,
-        in_channels: int = 4,
-        conditioning_channels: int = 4,
-        flip_sin_to_cos: bool = True,
-        freq_shift: int = 0,
-        down_block_types: Tuple[str, ...] = (
-            "CrossAttnDownBlockMotion",
-            "CrossAttnDownBlockMotion",
-            "CrossAttnDownBlockMotion",
-            "DownBlockMotion",
-        ),
-        only_cross_attention: Union[bool, Tuple[bool]] = False,
-        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
-        layers_per_block: int = 2,
-        downsample_padding: int = 1,
-        mid_block_scale_factor: float = 1,
-        act_fn: str = "silu",
-        norm_num_groups: Optional[int] = 32,
-        norm_eps: float = 1e-5,
-        cross_attention_dim: int = 768,
-        transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
-        transformer_layers_per_mid_block: Optional[Union[int, Tuple[int]]] = None,
-        temporal_transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
-        attention_head_dim: Union[int, Tuple[int, ...]] = 8,
-        num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
-        use_linear_projection: bool = False,
-        upcast_attention: bool = False,
-        resnet_time_scale_shift: str = "default",
-        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
-        global_pool_conditions: bool = False,
-        controlnet_conditioning_channel_order: str = "rgb",
-        motion_max_seq_length: int = 32,
-        motion_num_attention_heads: int = 8,
-        concat_conditioning_mask: bool = True,
-        use_simplified_condition_embedding: bool = True,
-    ):
-        super().__init__()
-        self.use_simplified_condition_embedding = use_simplified_condition_embedding
-
-        # If `num_attention_heads` is not defined (which is the case for most models)
-        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
-        # The reason for this behavior is to correct for incorrectly named variables that were introduced
-        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
-        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
-        # which is why we correct for the naming here.
-        num_attention_heads = num_attention_heads or attention_head_dim
-
-        # Check inputs
-        if len(block_out_channels) != len(down_block_types):
-            raise ValueError(
-                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
-            )
-
-        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
-            raise ValueError(
-                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
-            )
-
-        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
-            raise ValueError(
-                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
-            )
-
-        if isinstance(transformer_layers_per_block, int):
-            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
-        if isinstance(temporal_transformer_layers_per_block, int):
-            temporal_transformer_layers_per_block = [temporal_transformer_layers_per_block] * len(down_block_types)
-
-        # input
-        conv_in_kernel = 3
-        conv_in_padding = (conv_in_kernel - 1) // 2
-        self.conv_in = nn.Conv2d(
-            in_channels, block_out_channels[0], kernel_size=conv_in_kernel, padding=conv_in_padding
-        )
-
-        if concat_conditioning_mask:
-            conditioning_channels = conditioning_channels + 1
-
-        self.concat_conditioning_mask = concat_conditioning_mask
-
-        # control net conditioning embedding
-        if use_simplified_condition_embedding:
-            self.controlnet_cond_embedding = zero_module(
-                nn.Conv2d(conditioning_channels, block_out_channels[0], kernel_size=3, padding=1)
-            )
-        else:
-            self.controlnet_cond_embedding = SparseControlNetConditioningEmbedding(
-                conditioning_embedding_channels=block_out_channels[0],
-                block_out_channels=conditioning_embedding_out_channels,
-                conditioning_channels=conditioning_channels,
-            )
-
-        # time
-        time_embed_dim = block_out_channels[0] * 4
-        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
-        timestep_input_dim = block_out_channels[0]
-
-        self.time_embedding = TimestepEmbedding(
-            timestep_input_dim,
-            time_embed_dim,
-            act_fn=act_fn,
-        )
-
-        self.down_blocks = nn.ModuleList([])
-        self.controlnet_down_blocks = nn.ModuleList([])
-
-        if isinstance(cross_attention_dim, int):
-            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)
-
-        if isinstance(only_cross_attention, bool):
-            only_cross_attention = [only_cross_attention] * len(down_block_types)
-
-        if isinstance(attention_head_dim, int):
-            attention_head_dim = (attention_head_dim,) * len(down_block_types)
-
-        if isinstance(num_attention_heads, int):
-            num_attention_heads = (num_attention_heads,) * len(down_block_types)
-
-        if isinstance(motion_num_attention_heads, int):
-            motion_num_attention_heads = (motion_num_attention_heads,) * len(down_block_types)
-
-        # down
-        output_channel = block_out_channels[0]
-
-        controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
-        controlnet_block = zero_module(controlnet_block)
-        self.controlnet_down_blocks.append(controlnet_block)
-
-        for i, down_block_type in enumerate(down_block_types):
-            input_channel = output_channel
-            output_channel = block_out_channels[i]
-            is_final_block = i == len(block_out_channels) - 1
-
-            if down_block_type == "CrossAttnDownBlockMotion":
-                down_block = CrossAttnDownBlockMotion(
-                    in_channels=input_channel,
-                    out_channels=output_channel,
-                    temb_channels=time_embed_dim,
-                    dropout=0,
-                    num_layers=layers_per_block,
-                    transformer_layers_per_block=transformer_layers_per_block[i],
-                    resnet_eps=norm_eps,
-                    resnet_time_scale_shift=resnet_time_scale_shift,
-                    resnet_act_fn=act_fn,
-                    resnet_groups=norm_num_groups,
-                    resnet_pre_norm=True,
-                    num_attention_heads=num_attention_heads[i],
-                    cross_attention_dim=cross_attention_dim[i],
-                    add_downsample=not is_final_block,
-                    dual_cross_attention=False,
-                    use_linear_projection=use_linear_projection,
-                    only_cross_attention=only_cross_attention[i],
-                    upcast_attention=upcast_attention,
-                    temporal_num_attention_heads=motion_num_attention_heads[i],
-                    temporal_max_seq_length=motion_max_seq_length,
-                    temporal_transformer_layers_per_block=temporal_transformer_layers_per_block[i],
-                    temporal_double_self_attention=False,
-                )
-            elif down_block_type == "DownBlockMotion":
-                down_block = DownBlockMotion(
-                    in_channels=input_channel,
-                    out_channels=output_channel,
-                    temb_channels=time_embed_dim,
-                    dropout=0,
-                    num_layers=layers_per_block,
-                    resnet_eps=norm_eps,
-                    resnet_time_scale_shift=resnet_time_scale_shift,
-                    resnet_act_fn=act_fn,
-                    resnet_groups=norm_num_groups,
-                    resnet_pre_norm=True,
-                    add_downsample=not is_final_block,
-                    temporal_num_attention_heads=motion_num_attention_heads[i],
-                    temporal_max_seq_length=motion_max_seq_length,
-                    temporal_double_self_attention=False,
-                    temporal_transformer_layers_per_block=temporal_transformer_layers_per_block[i],
-                )
-            else:
-                raise ValueError(
-                    "Invalid `block_type` encountered. Must be one of `CrossAttnDownBlockMotion` or `DownBlockMotion`"
-                )
-
-            self.down_blocks.append(down_block)
-
-            for _ in range(layers_per_block):
-                controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
-                controlnet_block = zero_module(controlnet_block)
-                self.controlnet_down_blocks.append(controlnet_block)
-
-            if not is_final_block:
-                controlnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1)
-                controlnet_block = zero_module(controlnet_block)
-                self.controlnet_down_blocks.append(controlnet_block)
-
-        # mid
-        mid_block_channels = block_out_channels[-1]
-
-        controlnet_block = nn.Conv2d(mid_block_channels, mid_block_channels, kernel_size=1)
-        controlnet_block = zero_module(controlnet_block)
-        self.controlnet_mid_block = controlnet_block
-
-        if transformer_layers_per_mid_block is None:
-            transformer_layers_per_mid_block = (
-                transformer_layers_per_block[-1] if isinstance(transformer_layers_per_block[-1], int) else 1
-            )
-
-        self.mid_block = UNetMidBlock2DCrossAttn(
-            in_channels=mid_block_channels,
-            temb_channels=time_embed_dim,
-            dropout=0,
-            num_layers=1,
-            transformer_layers_per_block=transformer_layers_per_mid_block,
-            resnet_eps=norm_eps,
-            resnet_time_scale_shift=resnet_time_scale_shift,
-            resnet_act_fn=act_fn,
-            resnet_groups=norm_num_groups,
-            resnet_pre_norm=True,
-            num_attention_heads=num_attention_heads[-1],
-            output_scale_factor=mid_block_scale_factor,
-            cross_attention_dim=cross_attention_dim[-1],
-            dual_cross_attention=False,
-            use_linear_projection=use_linear_projection,
-            upcast_attention=upcast_attention,
-            attention_type="default",
-        )
-
-    @classmethod
-    def from_unet(
-        cls,
-        unet: UNet2DConditionModel,
-        controlnet_conditioning_channel_order: str = "rgb",
-        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
-        load_weights_from_unet: bool = True,
-        conditioning_channels: int = 3,
-    ) -> "SparseControlNetModel":
-        r"""
-        Instantiate a [`SparseControlNetModel`] from [`UNet2DConditionModel`].
-
-        Parameters:
-            unet (`UNet2DConditionModel`):
-                The UNet model weights to copy to the [`SparseControlNetModel`]. All configuration options are also
-                copied where applicable.
-        """
-        transformer_layers_per_block = (
-            unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1
-        )
-        down_block_types = unet.config.down_block_types
-
-        for i in range(len(down_block_types)):
-            if "CrossAttn" in down_block_types[i]:
-                down_block_types[i] = "CrossAttnDownBlockMotion"
-            elif "Down" in down_block_types[i]:
-                down_block_types[i] = "DownBlockMotion"
-            else:
-                raise ValueError("Invalid `block_type` encountered. Must be a cross-attention or down block")
-
-        controlnet = cls(
-            in_channels=unet.config.in_channels,
-            conditioning_channels=conditioning_channels,
-            flip_sin_to_cos=unet.config.flip_sin_to_cos,
-            freq_shift=unet.config.freq_shift,
-            down_block_types=unet.config.down_block_types,
-            only_cross_attention=unet.config.only_cross_attention,
-            block_out_channels=unet.config.block_out_channels,
-            layers_per_block=unet.config.layers_per_block,
-            downsample_padding=unet.config.downsample_padding,
-            mid_block_scale_factor=unet.config.mid_block_scale_factor,
-            act_fn=unet.config.act_fn,
-            norm_num_groups=unet.config.norm_num_groups,
-            norm_eps=unet.config.norm_eps,
-            cross_attention_dim=unet.config.cross_attention_dim,
-            transformer_layers_per_block=transformer_layers_per_block,
-            attention_head_dim=unet.config.attention_head_dim,
-            num_attention_heads=unet.config.num_attention_heads,
-            use_linear_projection=unet.config.use_linear_projection,
-            upcast_attention=unet.config.upcast_attention,
-            resnet_time_scale_shift=unet.config.resnet_time_scale_shift,
-            conditioning_embedding_out_channels=conditioning_embedding_out_channels,
-            controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
-        )
-
-        if load_weights_from_unet:
-            controlnet.conv_in.load_state_dict(unet.conv_in.state_dict(), strict=False)
-            controlnet.time_proj.load_state_dict(unet.time_proj.state_dict(), strict=False)
-            controlnet.time_embedding.load_state_dict(unet.time_embedding.state_dict(), strict=False)
-            controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict(), strict=False)
-            controlnet.mid_block.load_state_dict(unet.mid_block.state_dict(), strict=False)
-
-        return controlnet
-
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
-    def set_default_attn_processor(self):
-        """
-        Disables custom attention processors and sets the default attention implementation.
-        """
-        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
-            processor = AttnAddedKVProcessor()
-        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
-            processor = AttnProcessor()
-        else:
-            raise ValueError(
-                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
-            )
-
-        self.set_attn_processor(processor)
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice
-    def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None:
-        r"""
-        Enable sliced attention computation.
-
-        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
-        several steps. This is useful for saving some memory in exchange for a small decrease in speed.
-
-        Args:
-            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
-                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
-                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
-                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
-                must be a multiple of `slice_size`.
-        """
-        sliceable_head_dims = []
-
-        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
-            if hasattr(module, "set_attention_slice"):
-                sliceable_head_dims.append(module.sliceable_head_dim)
-
-            for child in module.children():
-                fn_recursive_retrieve_sliceable_dims(child)
-
-        # retrieve number of attention layers
-        for module in self.children():
-            fn_recursive_retrieve_sliceable_dims(module)
-
-        num_sliceable_layers = len(sliceable_head_dims)
-
-        if slice_size == "auto":
-            # half the attention head size is usually a good trade-off between
-            # speed and memory
-            slice_size = [dim // 2 for dim in sliceable_head_dims]
-        elif slice_size == "max":
-            # make smallest slice possible
-            slice_size = num_sliceable_layers * [1]
-
-        slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
-
-        if len(slice_size) != len(sliceable_head_dims):
-            raise ValueError(
-                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
-                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
-            )
-
-        for i in range(len(slice_size)):
-            size = slice_size[i]
-            dim = sliceable_head_dims[i]
-            if size is not None and size > dim:
-                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
-
-        # Recursively walk through all the children.
-        # Any children which exposes the set_attention_slice method
-        # gets the message
-        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
-            if hasattr(module, "set_attention_slice"):
-                module.set_attention_slice(slice_size.pop())
-
-            for child in module.children():
-                fn_recursive_set_attention_slice(child, slice_size)
-
-        reversed_slice_size = list(reversed(slice_size))
-        for module in self.children():
-            fn_recursive_set_attention_slice(module, reversed_slice_size)
-
-    def _set_gradient_checkpointing(self, module, value: bool = False) -> None:
-        if isinstance(module, (CrossAttnDownBlockMotion, DownBlockMotion, UNetMidBlock2DCrossAttn)):
-            module.gradient_checkpointing = value
-
-    def forward(
-        self,
-        sample: torch.Tensor,
-        timestep: Union[torch.Tensor, float, int],
-        encoder_hidden_states: torch.Tensor,
-        controlnet_cond: torch.Tensor,
-        conditioning_scale: float = 1.0,
-        timestep_cond: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        conditioning_mask: Optional[torch.Tensor] = None,
-        guess_mode: bool = False,
-        return_dict: bool = True,
-    ) -> Union[SparseControlNetOutput, Tuple[Tuple[torch.Tensor, ...], torch.Tensor]]:
-        """
-        The [`SparseControlNetModel`] forward method.
-
-        Args:
-            sample (`torch.Tensor`):
-                The noisy input tensor.
-            timestep (`Union[torch.Tensor, float, int]`):
-                The number of timesteps to denoise an input.
-            encoder_hidden_states (`torch.Tensor`):
-                The encoder hidden states.
-            controlnet_cond (`torch.Tensor`):
-                The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`.
-            conditioning_scale (`float`, defaults to `1.0`):
-                The scale factor for ControlNet outputs.
-            class_labels (`torch.Tensor`, *optional*, defaults to `None`):
-                Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings.
-            timestep_cond (`torch.Tensor`, *optional*, defaults to `None`):
-                Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the
-                timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep
-                embeddings.
-            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
-                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
-                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
-                negative values to the attention scores corresponding to "discard" tokens.
-            added_cond_kwargs (`dict`):
-                Additional conditions for the Stable Diffusion XL UNet.
-            cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`):
-                A kwargs dictionary that if specified is passed along to the `AttnProcessor`.
-            guess_mode (`bool`, defaults to `False`):
-                In this mode, the ControlNet encoder tries its best to recognize the input content of the input even if
-                you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
-            return_dict (`bool`, defaults to `True`):
-                Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.
-        Returns:
-            [`~models.controlnet.ControlNetOutput`] **or** `tuple`:
-                If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
-                returned where the first element is the sample tensor.
-        """
-        sample_batch_size, sample_channels, sample_num_frames, sample_height, sample_width = sample.shape
-        sample = torch.zeros_like(sample)
-
-        # check channel order
-        channel_order = self.config.controlnet_conditioning_channel_order
-
-        if channel_order == "rgb":
-            # in rgb order by default
-            ...
-        elif channel_order == "bgr":
-            controlnet_cond = torch.flip(controlnet_cond, dims=[1])
-        else:
-            raise ValueError(f"unknown `controlnet_conditioning_channel_order`: {channel_order}")
-
-        # prepare attention_mask
-        if attention_mask is not None:
-            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
-            attention_mask = attention_mask.unsqueeze(1)
-
-        # 1. time
-        timesteps = timestep
-        if not torch.is_tensor(timesteps):
-            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
-            # This would be a good case for the `match` statement (Python 3.10+)
-            is_mps = sample.device.type == "mps"
-            if isinstance(timestep, float):
-                dtype = torch.float32 if is_mps else torch.float64
-            else:
-                dtype = torch.int32 if is_mps else torch.int64
-            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
-        elif len(timesteps.shape) == 0:
-            timesteps = timesteps[None].to(sample.device)
-
-        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-        timesteps = timesteps.expand(sample.shape[0])
-
-        t_emb = self.time_proj(timesteps)
-
-        # timesteps does not contain any weights and will always return f32 tensors
-        # but time_embedding might actually be running in fp16. so we need to cast here.
-        # there might be better ways to encapsulate this.
-        t_emb = t_emb.to(dtype=sample.dtype)
-
-        emb = self.time_embedding(t_emb, timestep_cond)
-        emb = emb.repeat_interleave(sample_num_frames, dim=0)
-        encoder_hidden_states = encoder_hidden_states.repeat_interleave(sample_num_frames, dim=0)
-
-        # 2. pre-process
-        batch_size, channels, num_frames, height, width = sample.shape
-
-        sample = sample.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, height, width)
-        sample = self.conv_in(sample)
-
-        batch_frames, channels, height, width = sample.shape
-        sample = sample[:, None].reshape(sample_batch_size, sample_num_frames, channels, height, width)
-
-        if self.concat_conditioning_mask:
-            controlnet_cond = torch.cat([controlnet_cond, conditioning_mask], dim=1)
-
-        batch_size, channels, num_frames, height, width = controlnet_cond.shape
-        controlnet_cond = controlnet_cond.permute(0, 2, 1, 3, 4).reshape(
-            batch_size * num_frames, channels, height, width
-        )
-        controlnet_cond = self.controlnet_cond_embedding(controlnet_cond)
-        batch_frames, channels, height, width = controlnet_cond.shape
-        controlnet_cond = controlnet_cond[:, None].reshape(batch_size, num_frames, channels, height, width)
-
-        sample = sample + controlnet_cond
-
-        batch_size, num_frames, channels, height, width = sample.shape
-        sample = sample.reshape(sample_batch_size * sample_num_frames, channels, height, width)
-
-        # 3. down
-        down_block_res_samples = (sample,)
-        for downsample_block in self.down_blocks:
-            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
-                sample, res_samples = downsample_block(
-                    hidden_states=sample,
-                    temb=emb,
-                    encoder_hidden_states=encoder_hidden_states,
-                    attention_mask=attention_mask,
-                    num_frames=num_frames,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                )
-            else:
-                sample, res_samples = downsample_block(hidden_states=sample, temb=emb, num_frames=num_frames)
-
-            down_block_res_samples += res_samples
-
-        # 4. mid
-        if self.mid_block is not None:
-            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
-                sample = self.mid_block(
-                    sample,
-                    emb,
-                    encoder_hidden_states=encoder_hidden_states,
-                    attention_mask=attention_mask,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                )
-            else:
-                sample = self.mid_block(sample, emb)
-
-        # 5. Control net blocks
-        controlnet_down_block_res_samples = ()
-
-        for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
-            down_block_res_sample = controlnet_block(down_block_res_sample)
-            controlnet_down_block_res_samples = controlnet_down_block_res_samples + (down_block_res_sample,)
-
-        down_block_res_samples = controlnet_down_block_res_samples
-        mid_block_res_sample = self.controlnet_mid_block(sample)
-
-        # 6. scaling
-        if guess_mode and not self.config.global_pool_conditions:
-            scales = torch.logspace(-1, 0, len(down_block_res_samples) + 1, device=sample.device)  # 0.1 to 1.0
-            scales = scales * conditioning_scale
-            down_block_res_samples = [sample * scale for sample, scale in zip(down_block_res_samples, scales)]
-            mid_block_res_sample = mid_block_res_sample * scales[-1]  # last one
-        else:
-            down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
-            mid_block_res_sample = mid_block_res_sample * conditioning_scale
-
-        if self.config.global_pool_conditions:
-            down_block_res_samples = [
-                torch.mean(sample, dim=(2, 3), keepdim=True) for sample in down_block_res_samples
-            ]
-            mid_block_res_sample = torch.mean(mid_block_res_sample, dim=(2, 3), keepdim=True)
-
-        if not return_dict:
-            return (down_block_res_samples, mid_block_res_sample)
-
-        return SparseControlNetOutput(
-            down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
-        )
-
-
-# Copied from diffusers.models.controlnet.zero_module
-def zero_module(module: nn.Module) -> nn.Module:
-    for p in module.parameters():
-        nn.init.zeros_(p)
-    return module
@@ -352,13 +352,7 @@ def get_2d_rotary_pos_embed_lumina(embed_dim, len_h, len_w, linear_factor=1.0, n


 def get_1d_rotary_pos_embed(
-    dim: int,
-    pos: Union[np.ndarray, int],
-    theta: float = 10000.0,
-    use_real=False,
-    linear_factor=1.0,
-    ntk_factor=1.0,
-    repeat_interleave_real=True,
+    dim: int, pos: Union[np.ndarray, int], theta: float = 10000.0, use_real=False, linear_factor=1.0, ntk_factor=1.0
 ):
    """
    Precompute the frequency tensor for complex exponentials (cis) with given dimensions.
@@ -378,9 +372,6 @@ def get_1d_rotary_pos_embed(
            Scaling factor for the context extrapolation. Defaults to 1.0.
        ntk_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for the NTK-Aware RoPE. Defaults to 1.0.
-        repeat_interleave_real (`bool`, *optional*, defaults to `True`):
-            If `True` and `use_real`, real part and imaginary part are each interleaved with themselves to reach `dim`.
-            Otherwise, they are concateanted with themselves.
    Returns:
        `torch.Tensor`: Precomputed frequency tensor with complex exponentials. [S, D/2]
    """
@@ -392,14 +383,10 @@ def get_1d_rotary_pos_embed(
    freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) / linear_factor  # [D/2]
    t = torch.from_numpy(pos).to(freqs.device)  # type: ignore  # [S]
    freqs = torch.outer(t, freqs).float()  # type: ignore   # [S, D/2]
-    if use_real and repeat_interleave_real:
+    if use_real:
        freqs_cos = freqs.cos().repeat_interleave(2, dim=1)  # [S, D]
        freqs_sin = freqs.sin().repeat_interleave(2, dim=1)  # [S, D]
        return freqs_cos, freqs_sin
-    elif use_real:
-        freqs_cos = torch.cat([freqs.cos(), freqs.cos()], dim=-1)  # [S, D]
-        freqs_sin = torch.cat([freqs.sin(), freqs.sin()], dim=-1)  # [S, D]
-        return freqs_cos, freqs_sin
    else:
        freqs_cis = torch.polar(torch.ones_like(freqs), freqs)  # complex64     # [S, D/2]
        return freqs_cis
@@ -409,7 +396,6 @@ def apply_rotary_emb(
    x: torch.Tensor,
    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
    use_real: bool = True,
-    use_real_unbind_dim: int = -1,
 ) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
@@ -431,17 +417,8 @@ def apply_rotary_emb(
        sin = sin[None, None]
        cos, sin = cos.to(x.device), sin.to(x.device)

-        if use_real_unbind_dim == -1:
-            # Use for example in Lumina
-            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
-            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
-        elif use_real_unbind_dim == -2:
-            # Use for example in Stable Audio
-            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, S, H, D//2]
-            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
-        else:
-            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
-
+        x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
+        x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)

        return out
@@ -10,7 +10,6 @@ if is_torch_available():
    from .lumina_nextdit2d import LuminaNextDiT2DModel
    from .pixart_transformer_2d import PixArtTransformer2DModel
    from .prior_transformer import PriorTransformer
-    from .stable_audio_transformer import StableAudioDiTModel
    from .t5_film_transformer import T5FilmDecoder
    from .transformer_2d import Transformer2DModel
    from .transformer_sd3 import SD3Transformer2DModel
@@ -138,14 +138,14 @@ class AuraFlowSingleTransformerBlock(nn.Module):
        self.norm2 = FP32LayerNorm(dim, elementwise_affine=False, bias=False)
        self.ff = AuraFlowFeedForward(dim, dim * 4)

-    def forward(self, hidden_states: torch.FloatTensor, temb: torch.FloatTensor):
+    def forward(self, hidden_states: torch.FloatTensor, temb: torch.FloatTensor, i=9999):
        residual = hidden_states

        # Norm + Projection.
        norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)

        # Attention.
-        attn_output = self.attn(hidden_states=norm_hidden_states)
+        attn_output = self.attn(hidden_states=norm_hidden_states, i=i)

        # Process attention outputs for the `hidden_states`.
        hidden_states = self.norm2(residual + gate_msa.unsqueeze(1) * attn_output)
@@ -201,7 +201,7 @@ class AuraFlowJointTransformerBlock(nn.Module):
        self.ff_context = AuraFlowFeedForward(dim, dim * 4)

    def forward(
-        self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, temb: torch.FloatTensor
+        self, hidden_states: torch.FloatTensor, encoder_hidden_states: torch.FloatTensor, temb: torch.FloatTensor, i=0
    ):
        residual = hidden_states
        residual_context = encoder_hidden_states
@@ -214,7 +214,7 @@ class AuraFlowJointTransformerBlock(nn.Module):

        # Attention.
        attn_output, context_attn_output = self.attn(
-            hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states
+            hidden_states=norm_hidden_states, encoder_hidden_states=norm_encoder_hidden_states, i=i
        )

        # Process attention outputs for the `hidden_states`.
@@ -366,7 +366,7 @@ class AuraFlowTransformer2DModel(ModelMixin, ConfigMixin):

            else:
                encoder_hidden_states, hidden_states = block(
-                    hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, temb=temb
+                    hidden_states=hidden_states, encoder_hidden_states=encoder_hidden_states, temb=temb, i=index_block
                )

        # Single DiT blocks that combine the `hidden_states` (image) and `encoder_hidden_states` (text)
@@ -1,458 +0,0 @@
-# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-from typing import Any, Dict, Optional, Union
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.utils.checkpoint
-
-from ...configuration_utils import ConfigMixin, register_to_config
-from ...models.attention import FeedForward
-from ...models.attention_processor import (
-    Attention,
-    AttentionProcessor,
-    StableAudioAttnProcessor2_0,
-)
-from ...models.modeling_utils import ModelMixin
-from ...models.transformers.transformer_2d import Transformer2DModelOutput
-from ...utils import is_torch_version, logging
-from ...utils.torch_utils import maybe_allow_in_graph
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-class StableAudioGaussianFourierProjection(nn.Module):
-    """Gaussian Fourier embeddings for noise levels."""
-
-    # Copied from diffusers.models.embeddings.GaussianFourierProjection.__init__
-    def __init__(
-        self, embedding_size: int = 256, scale: float = 1.0, set_W_to_weight=True, log=True, flip_sin_to_cos=False
-    ):
-        super().__init__()
-        self.weight = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
-        self.log = log
-        self.flip_sin_to_cos = flip_sin_to_cos
-
-        if set_W_to_weight:
-            # to delete later
-            del self.weight
-            self.W = nn.Parameter(torch.randn(embedding_size) * scale, requires_grad=False)
-            self.weight = self.W
-            del self.W
-
-    def forward(self, x):
-        if self.log:
-            x = torch.log(x)
-
-        x_proj = 2 * np.pi * x[:, None] @ self.weight[None, :]
-
-        if self.flip_sin_to_cos:
-            out = torch.cat([torch.cos(x_proj), torch.sin(x_proj)], dim=-1)
-        else:
-            out = torch.cat([torch.sin(x_proj), torch.cos(x_proj)], dim=-1)
-        return out
-
-
-@maybe_allow_in_graph
-class StableAudioDiTBlock(nn.Module):
-    r"""
-    Transformer block used in Stable Audio model (https://github.com/Stability-AI/stable-audio-tools). Allow skip
-    connection and QKNorm
-
-    Parameters:
-        dim (`int`): The number of channels in the input and output.
-        num_attention_heads (`int`): The number of heads to use for the query states.
-        num_key_value_attention_heads (`int`): The number of heads to use for the key and value states.
-        attention_head_dim (`int`): The number of channels in each head.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
-        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
-        upcast_attention (`bool`, *optional*):
-            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
-    """
-
-    def __init__(
-        self,
-        dim: int,
-        num_attention_heads: int,
-        num_key_value_attention_heads: int,
-        attention_head_dim: int,
-        dropout=0.0,
-        cross_attention_dim: Optional[int] = None,
-        upcast_attention: bool = False,
-        norm_eps: float = 1e-5,
-        ff_inner_dim: Optional[int] = None,
-    ):
-        super().__init__()
-        # Define 3 blocks. Each block has its own normalization layer.
-        # 1. Self-Attn
-        self.norm1 = nn.LayerNorm(dim, elementwise_affine=True, eps=norm_eps)
-        self.attn1 = Attention(
-            query_dim=dim,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            dropout=dropout,
-            bias=False,
-            upcast_attention=upcast_attention,
-            out_bias=False,
-            processor=StableAudioAttnProcessor2_0(),
-        )
-
-        # 2. Cross-Attn
-        self.norm2 = nn.LayerNorm(dim, norm_eps, True)
-
-        self.attn2 = Attention(
-            query_dim=dim,
-            cross_attention_dim=cross_attention_dim,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            kv_heads=num_key_value_attention_heads,
-            dropout=dropout,
-            bias=False,
-            upcast_attention=upcast_attention,
-            out_bias=False,
-            processor=StableAudioAttnProcessor2_0(),
-        )  # is self-attn if encoder_hidden_states is none
-
-        # 3. Feed-forward
-        self.norm3 = nn.LayerNorm(dim, norm_eps, True)
-        self.ff = FeedForward(
-            dim,
-            dropout=dropout,
-            activation_fn="swiglu",
-            final_dropout=False,
-            inner_dim=ff_inner_dim,
-            bias=True,
-        )
-
-        # let chunk size default to None
-        self._chunk_size = None
-        self._chunk_dim = 0
-
-    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
-        # Sets chunk feed-forward
-        self._chunk_size = chunk_size
-        self._chunk_dim = dim
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.Tensor] = None,
-        rotary_embedding: Optional[torch.FloatTensor] = None,
-    ) -> torch.Tensor:
-        # Notice that normalization is always applied before the real computation in the following blocks.
-        # 0. Self-Attention
-        norm_hidden_states = self.norm1(hidden_states)
-
-        attn_output = self.attn1(
-            norm_hidden_states,
-            attention_mask=attention_mask,
-            rotary_emb=rotary_embedding,
-        )
-
-        hidden_states = attn_output + hidden_states
-
-        # 2. Cross-Attention
-        norm_hidden_states = self.norm2(hidden_states)
-
-        attn_output = self.attn2(
-            norm_hidden_states,
-            encoder_hidden_states=encoder_hidden_states,
-            attention_mask=encoder_attention_mask,
-        )
-        hidden_states = attn_output + hidden_states
-
-        # 3. Feed-forward
-        norm_hidden_states = self.norm3(hidden_states)
-        ff_output = self.ff(norm_hidden_states)
-
-        hidden_states = ff_output + hidden_states
-
-        return hidden_states
-
-
-class StableAudioDiTModel(ModelMixin, ConfigMixin):
-    """
-    The Diffusion Transformer model introduced in Stable Audio.
-
-    Reference: https://github.com/Stability-AI/stable-audio-tools
-
-    Parameters:
-        sample_size ( `int`, *optional*, defaults to 1024): The size of the input sample.
-        in_channels (`int`, *optional*, defaults to 64): The number of channels in the input.
-        num_layers (`int`, *optional*, defaults to 24): The number of layers of Transformer blocks to use.
-        attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
-        num_attention_heads (`int`, *optional*, defaults to 24): The number of heads to use for the query states.
-        num_key_value_attention_heads (`int`, *optional*, defaults to 12):
-            The number of heads to use for the key and value states.
-        out_channels (`int`, defaults to 64): Number of output channels.
-        cross_attention_dim ( `int`, *optional*, defaults to 768): Dimension of the cross-attention projection.
-        time_proj_dim ( `int`, *optional*, defaults to 256): Dimension of the timestep inner projection.
-        global_states_input_dim ( `int`, *optional*, defaults to 1536):
-            Input dimension of the global hidden states projection.
-        cross_attention_input_dim ( `int`, *optional*, defaults to 768):
-            Input dimension of the cross-attention projection
-    """
-
-    _supports_gradient_checkpointing = True
-
-    @register_to_config
-    def __init__(
-        self,
-        sample_size: int = 1024,
-        in_channels: int = 64,
-        num_layers: int = 24,
-        attention_head_dim: int = 64,
-        num_attention_heads: int = 24,
-        num_key_value_attention_heads: int = 12,
-        out_channels: int = 64,
-        cross_attention_dim: int = 768,
-        time_proj_dim: int = 256,
-        global_states_input_dim: int = 1536,
-        cross_attention_input_dim: int = 768,
-    ):
-        super().__init__()
-        self.sample_size = sample_size
-        self.out_channels = out_channels
-        self.inner_dim = num_attention_heads * attention_head_dim
-
-        self.time_proj = StableAudioGaussianFourierProjection(
-            embedding_size=time_proj_dim // 2,
-            flip_sin_to_cos=True,
-            log=False,
-            set_W_to_weight=False,
-        )
-
-        self.timestep_proj = nn.Sequential(
-            nn.Linear(time_proj_dim, self.inner_dim, bias=True),
-            nn.SiLU(),
-            nn.Linear(self.inner_dim, self.inner_dim, bias=True),
-        )
-
-        self.global_proj = nn.Sequential(
-            nn.Linear(global_states_input_dim, self.inner_dim, bias=False),
-            nn.SiLU(),
-            nn.Linear(self.inner_dim, self.inner_dim, bias=False),
-        )
-
-        self.cross_attention_proj = nn.Sequential(
-            nn.Linear(cross_attention_input_dim, cross_attention_dim, bias=False),
-            nn.SiLU(),
-            nn.Linear(cross_attention_dim, cross_attention_dim, bias=False),
-        )
-
-        self.preprocess_conv = nn.Conv1d(in_channels, in_channels, 1, bias=False)
-        self.proj_in = nn.Linear(in_channels, self.inner_dim, bias=False)
-
-        self.transformer_blocks = nn.ModuleList(
-            [
-                StableAudioDiTBlock(
-                    dim=self.inner_dim,
-                    num_attention_heads=num_attention_heads,
-                    num_key_value_attention_heads=num_key_value_attention_heads,
-                    attention_head_dim=attention_head_dim,
-                    cross_attention_dim=cross_attention_dim,
-                )
-                for i in range(num_layers)
-            ]
-        )
-
-        self.proj_out = nn.Linear(self.inner_dim, self.out_channels, bias=False)
-        self.postprocess_conv = nn.Conv1d(self.out_channels, self.out_channels, 1, bias=False)
-
-        self.gradient_checkpointing = False
-
-    @property
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
-    def attn_processors(self) -> Dict[str, AttentionProcessor]:
-        r"""
-        Returns:
-            `dict` of attention processors: A dictionary containing all attention processors used in the model with
-            indexed by its weight name.
-        """
-        # set recursively
-        processors = {}
-
-        def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
-            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor()
-
-            for sub_name, child in module.named_children():
-                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
-
-            return processors
-
-        for name, module in self.named_children():
-            fn_recursive_add_processors(name, module, processors)
-
-        return processors
-
-    # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
-        r"""
-        Sets the attention processor to use to compute attention.
-
-        Parameters:
-            processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
-                The instantiated processor class or a dictionary of processor classes that will be set as the processor
-                for **all** `Attention` layers.
-
-                If `processor` is a dict, the key needs to define the path to the corresponding cross attention
-                processor. This is strongly recommended when setting trainable attention processors.
-
-        """
-        count = len(self.attn_processors.keys())
-
-        if isinstance(processor, dict) and len(processor) != count:
-            raise ValueError(
-                f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
-                f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
-            )
-
-        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
-            if hasattr(module, "set_processor"):
-                if not isinstance(processor, dict):
-                    module.set_processor(processor)
-                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
-
-            for sub_name, child in module.named_children():
-                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
-
-        for name, module in self.named_children():
-            fn_recursive_attn_processor(name, module, processor)
-
-    # Copied from diffusers.models.transformers.hunyuan_transformer_2d.HunyuanDiT2DModel.set_default_attn_processor with Hunyuan->StableAudio
-    def set_default_attn_processor(self):
-        """
-        Disables custom attention processors and sets the default attention implementation.
-        """
-        self.set_attn_processor(StableAudioAttnProcessor2_0())
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        if hasattr(module, "gradient_checkpointing"):
-            module.gradient_checkpointing = value
-
-    def forward(
-        self,
-        hidden_states: torch.FloatTensor,
-        timestep: torch.LongTensor = None,
-        encoder_hidden_states: torch.FloatTensor = None,
-        global_hidden_states: torch.FloatTensor = None,
-        rotary_embedding: torch.FloatTensor = None,
-        return_dict: bool = True,
-        attention_mask: Optional[torch.LongTensor] = None,
-        encoder_attention_mask: Optional[torch.LongTensor] = None,
-    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
-        """
-        The [`StableAudioDiTModel`] forward method.
-
-        Args:
-            hidden_states (`torch.FloatTensor` of shape `(batch size, in_channels, sequence_len)`):
-                Input `hidden_states`.
-            timestep ( `torch.LongTensor`):
-                Used to indicate denoising step.
-            encoder_hidden_states (`torch.FloatTensor` of shape `(batch size, encoder_sequence_len, cross_attention_input_dim)`):
-                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
-            global_hidden_states (`torch.FloatTensor` of shape `(batch size, global_sequence_len, global_states_input_dim)`):
-               Global embeddings that will be prepended to the hidden states.
-            rotary_embedding (`torch.Tensor`):
-                The rotary embeddings to apply on query and key tensors during attention calculation.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
-                tuple.
-            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_len)`, *optional*):
-                Mask to avoid performing attention on padding token indices, formed by concatenating the attention
-                masks
-                    for the two text encoders together. Mask values selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-            encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_len)`, *optional*):
-                Mask to avoid performing attention on padding token cross-attention indices, formed by concatenating
-                the attention masks
-                    for the two text encoders together. Mask values selected in `[0, 1]`:
-
-                - 1 for tokens that are **not masked**,
-                - 0 for tokens that are **masked**.
-        Returns:
-            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
-            `tuple` where the first element is the sample tensor.
-        """
-        cross_attention_hidden_states = self.cross_attention_proj(encoder_hidden_states)
-        global_hidden_states = self.global_proj(global_hidden_states)
-        time_hidden_states = self.timestep_proj(self.time_proj(timestep.to(self.dtype)))
-
-        global_hidden_states = global_hidden_states + time_hidden_states.unsqueeze(1)
-
-        hidden_states = self.preprocess_conv(hidden_states) + hidden_states
-        # (batch_size, dim, sequence_length) -> (batch_size, sequence_length, dim)
-        hidden_states = hidden_states.transpose(1, 2)
-
-        hidden_states = self.proj_in(hidden_states)
-
-        # prepend global states to hidden states
-        hidden_states = torch.cat([global_hidden_states, hidden_states], dim=-2)
-        if attention_mask is not None:
-            prepend_mask = torch.ones((hidden_states.shape[0], 1), device=hidden_states.device, dtype=torch.bool)
-            attention_mask = torch.cat([prepend_mask, attention_mask], dim=-1)
-
-        for block in self.transformer_blocks:
-            if self.training and self.gradient_checkpointing:
-
-                def create_custom_forward(module, return_dict=None):
-                    def custom_forward(*inputs):
-                        if return_dict is not None:
-                            return module(*inputs, return_dict=return_dict)
-                        else:
-                            return module(*inputs)
-
-                    return custom_forward
-
-                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
-                hidden_states = torch.utils.checkpoint.checkpoint(
-                    create_custom_forward(block),
-                    hidden_states,
-                    attention_mask,
-                    cross_attention_hidden_states,
-                    encoder_attention_mask,
-                    rotary_embedding,
-                    **ckpt_kwargs,
-                )
-
-            else:
-                hidden_states = block(
-                    hidden_states=hidden_states,
-                    attention_mask=attention_mask,
-                    encoder_hidden_states=cross_attention_hidden_states,
-                    encoder_attention_mask=encoder_attention_mask,
-                    rotary_embedding=rotary_embedding,
-                )
-
-        hidden_states = self.proj_out(hidden_states)
-
-        # (batch_size, sequence_length, dim) -> (batch_size, dim, sequence_length)
-        # remove prepend length that has been added by global hidden states
-        hidden_states = hidden_states.transpose(1, 2)[:, :, 1:]
-        hidden_states = self.postprocess_conv(hidden_states) + hidden_states
-
-        if not return_dict:
-            return (hidden_states,)
-
-        return Transformer2DModelOutput(sample=hidden_states)
@@ -1027,10 +1027,6 @@ class UNet2DConditionModel(
                raise ValueError(
                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
                )
-
-            if hasattr(self, "text_encoder_hid_proj") and self.text_encoder_hid_proj is not None:
-                encoder_hidden_states = self.text_encoder_hid_proj(encoder_hidden_states)
-
            image_embeds = added_cond_kwargs.get("image_embeds")
            image_embeds = self.encoder_hid_proj(image_embeds)
            encoder_hidden_states = (encoder_hidden_states, image_embeds)
@@ -966,7 +966,6 @@ class DownBlockMotion(nn.Module):
        temporal_num_attention_heads: Union[int, Tuple[int]] = 1,
        temporal_cross_attention_dim: Optional[int] = None,
        temporal_max_seq_length: int = 32,
-        temporal_double_self_attention: bool = True,
        temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1,
    ):
        super().__init__()
@@ -1017,7 +1016,6 @@ class DownBlockMotion(nn.Module):
                    positional_embeddings="sinusoidal",
                    num_positional_embeddings=temporal_max_seq_length,
                    attention_head_dim=out_channels // temporal_num_attention_heads[i],
-                    double_self_attention=temporal_double_self_attention,
                )
            )

@@ -1120,7 +1118,6 @@ class CrossAttnDownBlockMotion(nn.Module):
        temporal_num_attention_heads: int = 8,
        temporal_max_seq_length: int = 32,
        temporal_transformer_layers_per_block: Union[int, Tuple[int]] = 1,
-        temporal_double_self_attention: bool = True,
    ):
        super().__init__()
        resnets = []
@@ -1202,7 +1199,6 @@ class CrossAttnDownBlockMotion(nn.Module):
                    positional_embeddings="sinusoidal",
                    num_positional_embeddings=temporal_max_seq_length,
                    attention_head_dim=out_channels // temporal_num_attention_heads,
-                    double_self_attention=temporal_double_self_attention,
                )
            )

@@ -19,7 +19,7 @@ import torch.nn.functional as F
 import torch.utils.checkpoint

 from ...configuration_utils import ConfigMixin, FrozenDict, register_to_config
-from ...loaders import FromOriginalModelMixin, PeftAdapterMixin, UNet2DConditionLoadersMixin
+from ...loaders import FromOriginalModelMixin, UNet2DConditionLoadersMixin
 from ...utils import logging
 from ..attention_processor import (
    ADDED_KV_ATTENTION_PROCESSORS,
@@ -231,7 +231,7 @@ class MotionAdapter(ModelMixin, ConfigMixin, FromOriginalModelMixin):
        pass


-class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin, PeftAdapterMixin):
+class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
    r"""
    A modified conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a
    sample shaped output.
@@ -118,9 +118,7 @@ else:
    _import_structure["amused"] = ["AmusedImg2ImgPipeline", "AmusedInpaintPipeline", "AmusedPipeline"]
    _import_structure["animatediff"] = [
        "AnimateDiffPipeline",
-        "AnimateDiffControlNetPipeline",
        "AnimateDiffSDXLPipeline",
-        "AnimateDiffSparseControlNetPipeline",
        "AnimateDiffVideoToVideoPipeline",
    ]
    _import_structure["audioldm"] = ["AudioLDMPipeline"]
@@ -232,10 +230,6 @@ else:
    _import_structure["pixart_alpha"] = ["PixArtAlphaPipeline", "PixArtSigmaPipeline"]
    _import_structure["semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"]
    _import_structure["shap_e"] = ["ShapEImg2ImgPipeline", "ShapEPipeline"]
-    _import_structure["stable_audio"] = [
-        "StableAudioProjectionModel",
-        "StableAudioPipeline",
-    ]
    _import_structure["stable_cascade"] = [
        "StableCascadeCombinedPipeline",
        "StableCascadeDecoderPipeline",
@@ -419,13 +413,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from ..utils.dummy_torch_and_transformers_objects import *
    else:
        from .amused import AmusedImg2ImgPipeline, AmusedInpaintPipeline, AmusedPipeline
-        from .animatediff import (
-            AnimateDiffControlNetPipeline,
-            AnimateDiffPipeline,
-            AnimateDiffSDXLPipeline,
-            AnimateDiffSparseControlNetPipeline,
-            AnimateDiffVideoToVideoPipeline,
-        )
+        from .animatediff import AnimateDiffPipeline, AnimateDiffSDXLPipeline, AnimateDiffVideoToVideoPipeline
        from .audioldm import AudioLDMPipeline
        from .audioldm2 import (
            AudioLDM2Pipeline,
@@ -539,7 +527,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .pixart_alpha import PixArtAlphaPipeline, PixArtSigmaPipeline
        from .semantic_stable_diffusion import SemanticStableDiffusionPipeline
        from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline
-        from .stable_audio import StableAudioPipeline, StableAudioProjectionModel
        from .stable_cascade import (
            StableCascadeCombinedPipeline,
            StableCascadeDecoderPipeline,
@@ -22,9 +22,7 @@ except OptionalDependencyNotAvailable:
    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
    _import_structure["pipeline_animatediff"] = ["AnimateDiffPipeline"]
-    _import_structure["pipeline_animatediff_controlnet"] = ["AnimateDiffControlNetPipeline"]
    _import_structure["pipeline_animatediff_sdxl"] = ["AnimateDiffSDXLPipeline"]
-    _import_structure["pipeline_animatediff_sparsectrl"] = ["AnimateDiffSparseControlNetPipeline"]
    _import_structure["pipeline_animatediff_video2video"] = ["AnimateDiffVideoToVideoPipeline"]

 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -36,9 +34,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:

    else:
        from .pipeline_animatediff import AnimateDiffPipeline
-        from .pipeline_animatediff_controlnet import AnimateDiffControlNetPipeline
        from .pipeline_animatediff_sdxl import AnimateDiffSDXLPipeline
-        from .pipeline_animatediff_sparsectrl import AnimateDiffSparseControlNetPipeline
        from .pipeline_animatediff_video2video import AnimateDiffVideoToVideoPipeline
        from .pipeline_output import AnimateDiffPipelineOutput

@@ -180,8 +180,6 @@ class FreeInitMixin:
            num_inference_steps = max(
                1, int(num_inference_steps / self._free_init_num_iters * (free_init_iteration + 1))
            )
-
-        if num_inference_steps > 0:
            self.scheduler.set_timesteps(num_inference_steps, device=device)

        return latents, self.scheduler.timesteps
@@ -15,12 +15,11 @@ import inspect
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import torch
-from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
-from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import IPAdapterMixin, StableDiffusionXLLoraLoaderMixin
-from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...image_processor import VaeImageProcessor
+from ...loaders import StableDiffusionXLLoraLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import is_torch_xla_available, logging, replace_example_docstring
@@ -121,7 +120,7 @@ def retrieve_timesteps(
    return timesteps, num_inference_steps


-class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, IPAdapterMixin):
+class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin):
    r"""
    Pipeline for text-to-image generation using Kolors.

@@ -131,7 +130,6 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL
    The pipeline also inherits the following loading methods:
        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
-        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
@@ -150,11 +148,7 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL
            `Kwai-Kolors/Kolors-diffusers`.
    """

-    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
-    _optional_components = [
-        "image_encoder",
-        "feature_extractor",
-    ]
+    model_cpu_offload_seq = "text_encoder->unet->vae"
    _callback_tensor_inputs = [
        "latents",
        "prompt_embeds",
@@ -172,21 +166,11 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL
        tokenizer: ChatGLMTokenizer,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
-        image_encoder: CLIPVisionModelWithProjection = None,
-        feature_extractor: CLIPImageProcessor = None,
        force_zeros_for_empty_prompt: bool = False,
    ):
        super().__init__()

-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            image_encoder=image_encoder,
-            feature_extractor=feature_extractor,
-        )
+        self.register_modules(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
        self.vae_scale_factor = (
            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
@@ -359,77 +343,6 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL

        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
-    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
-        dtype = next(self.image_encoder.parameters()).dtype
-
-        if not isinstance(image, torch.Tensor):
-            image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
-        image = image.to(device=device, dtype=dtype)
-        if output_hidden_states:
-            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_enc_hidden_states = self.image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
-                num_images_per_prompt, dim=0
-            )
-            return image_enc_hidden_states, uncond_image_enc_hidden_states
-        else:
-            image_embeds = self.image_encoder(image).image_embeds
-            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_embeds = torch.zeros_like(image_embeds)
-
-            return image_embeds, uncond_image_embeds
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
-    def prepare_ip_adapter_image_embeds(
-        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
-    ):
-        image_embeds = []
-        if do_classifier_free_guidance:
-            negative_image_embeds = []
-        if ip_adapter_image_embeds is None:
-            if not isinstance(ip_adapter_image, list):
-                ip_adapter_image = [ip_adapter_image]
-
-            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
-                raise ValueError(
-                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
-                )
-
-            for single_ip_adapter_image, image_proj_layer in zip(
-                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
-            ):
-                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
-                single_image_embeds, single_negative_image_embeds = self.encode_image(
-                    single_ip_adapter_image, device, 1, output_hidden_state
-                )
-
-                image_embeds.append(single_image_embeds[None, :])
-                if do_classifier_free_guidance:
-                    negative_image_embeds.append(single_negative_image_embeds[None, :])
-        else:
-            for single_image_embeds in ip_adapter_image_embeds:
-                if do_classifier_free_guidance:
-                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    negative_image_embeds.append(single_negative_image_embeds)
-                image_embeds.append(single_image_embeds)
-
-        ip_adapter_image_embeds = []
-        for i, single_image_embeds in enumerate(image_embeds):
-            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
-            if do_classifier_free_guidance:
-                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
-                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
-
-            single_image_embeds = single_image_embeds.to(device=device)
-            ip_adapter_image_embeds.append(single_image_embeds)
-
-        return ip_adapter_image_embeds
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@@ -451,7 +364,6 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL
    def check_inputs(
        self,
        prompt,
-        num_inference_steps,
        height,
        width,
        negative_prompt=None,
@@ -459,17 +371,9 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL
        pooled_prompt_embeds=None,
        negative_prompt_embeds=None,
        negative_pooled_prompt_embeds=None,
-        ip_adapter_image=None,
-        ip_adapter_image_embeds=None,
        callback_on_step_end_tensor_inputs=None,
        max_sequence_length=None,
    ):
-        if not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
-            raise ValueError(
-                f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
-                f" {type(num_inference_steps)}."
-            )
-
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

@@ -516,21 +420,6 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL
                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
            )

-        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
-            raise ValueError(
-                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
-            )
-
-        if ip_adapter_image_embeds is not None:
-            if not isinstance(ip_adapter_image_embeds, list):
-                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
-                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
-                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
-                )
-
        if max_sequence_length is not None and max_sequence_length > 256:
            raise ValueError(f"`max_sequence_length` cannot be greater than 256 but is {max_sequence_length}")

@@ -674,8 +563,6 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL
        pooled_prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -762,12 +649,6 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
-            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
-                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
-                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
-                provided, embeddings are computed from the `ip_adapter_image` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -838,7 +719,6 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL
        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
-            num_inference_steps,
            height,
            width,
            negative_prompt,
@@ -846,8 +726,6 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL
            pooled_prompt_embeds,
            negative_prompt_embeds,
            negative_pooled_prompt_embeds,
-            ip_adapter_image,
-            ip_adapter_image_embeds,
            callback_on_step_end_tensor_inputs,
            max_sequence_length=max_sequence_length,
        )
@@ -937,15 +815,6 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL
        add_text_embeds = add_text_embeds.to(device)
        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)

-        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
-            image_embeds = self.prepare_ip_adapter_image_embeds(
-                ip_adapter_image,
-                ip_adapter_image_embeds,
-                device,
-                batch_size * num_images_per_prompt,
-                self.do_classifier_free_guidance,
-            )
-
        # 8. Denoising loop
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

@@ -987,9 +856,6 @@ class KolorsPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLL
                # predict the noise residual
                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}

-                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
-                    added_cond_kwargs["image_embeds"] = image_embeds
-
                noise_pred = self.unet(
                    latent_model_input,
                    t,
@@ -16,12 +16,11 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import PIL.Image
 import torch
-from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

 from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import IPAdapterMixin, StableDiffusionXLLoraLoaderMixin
-from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...loaders import StableDiffusionXLLoraLoaderMixin
+from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.attention_processor import AttnProcessor2_0, FusedAttnProcessor2_0, XFormersAttnProcessor
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import is_torch_xla_available, logging, replace_example_docstring
@@ -140,7 +139,7 @@ def retrieve_timesteps(
    return timesteps, num_inference_steps


-class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin, IPAdapterMixin):
+class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffusionXLLoraLoaderMixin):
    r"""
    Pipeline for text-to-image generation using Kolors.

@@ -150,7 +149,6 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
    The pipeline also inherits the following loading methods:
        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
-        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
@@ -169,10 +167,10 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
            `Kwai-Kolors/Kolors-diffusers`.
    """

-    model_cpu_offload_seq = "text_encoder->image_encoder-unet->vae"
+    model_cpu_offload_seq = "text_encoder->unet->vae"
    _optional_components = [
-        "image_encoder",
-        "feature_extractor",
+        "tokenizer",
+        "text_encoder",
    ]
    _callback_tensor_inputs = [
        "latents",
@@ -191,21 +189,11 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
        tokenizer: ChatGLMTokenizer,
        unet: UNet2DConditionModel,
        scheduler: KarrasDiffusionSchedulers,
-        image_encoder: CLIPVisionModelWithProjection = None,
-        feature_extractor: CLIPImageProcessor = None,
        force_zeros_for_empty_prompt: bool = False,
    ):
        super().__init__()

-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            image_encoder=image_encoder,
-            feature_extractor=feature_extractor,
-        )
+        self.register_modules(vae=vae, text_encoder=text_encoder, tokenizer=tokenizer, unet=unet, scheduler=scheduler)
        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
        self.vae_scale_factor = (
            2 ** (len(self.vae.config.block_out_channels) - 1) if hasattr(self, "vae") and self.vae is not None else 8
@@ -379,77 +367,6 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu

        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
-    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
-        dtype = next(self.image_encoder.parameters()).dtype
-
-        if not isinstance(image, torch.Tensor):
-            image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
-        image = image.to(device=device, dtype=dtype)
-        if output_hidden_states:
-            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_enc_hidden_states = self.image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
-                num_images_per_prompt, dim=0
-            )
-            return image_enc_hidden_states, uncond_image_enc_hidden_states
-        else:
-            image_embeds = self.image_encoder(image).image_embeds
-            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_embeds = torch.zeros_like(image_embeds)
-
-            return image_embeds, uncond_image_embeds
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
-    def prepare_ip_adapter_image_embeds(
-        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance
-    ):
-        image_embeds = []
-        if do_classifier_free_guidance:
-            negative_image_embeds = []
-        if ip_adapter_image_embeds is None:
-            if not isinstance(ip_adapter_image, list):
-                ip_adapter_image = [ip_adapter_image]
-
-            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
-                raise ValueError(
-                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
-                )
-
-            for single_ip_adapter_image, image_proj_layer in zip(
-                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
-            ):
-                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
-                single_image_embeds, single_negative_image_embeds = self.encode_image(
-                    single_ip_adapter_image, device, 1, output_hidden_state
-                )
-
-                image_embeds.append(single_image_embeds[None, :])
-                if do_classifier_free_guidance:
-                    negative_image_embeds.append(single_negative_image_embeds[None, :])
-        else:
-            for single_image_embeds in ip_adapter_image_embeds:
-                if do_classifier_free_guidance:
-                    single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2)
-                    negative_image_embeds.append(single_negative_image_embeds)
-                image_embeds.append(single_image_embeds)
-
-        ip_adapter_image_embeds = []
-        for i, single_image_embeds in enumerate(image_embeds):
-            single_image_embeds = torch.cat([single_image_embeds] * num_images_per_prompt, dim=0)
-            if do_classifier_free_guidance:
-                single_negative_image_embeds = torch.cat([negative_image_embeds[i]] * num_images_per_prompt, dim=0)
-                single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds], dim=0)
-
-            single_image_embeds = single_image_embeds.to(device=device)
-            ip_adapter_image_embeds.append(single_image_embeds)
-
-        return ip_adapter_image_embeds
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@@ -472,7 +389,6 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
        self,
        prompt,
        strength,
-        num_inference_steps,
        height,
        width,
        negative_prompt=None,
@@ -480,20 +396,12 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
        pooled_prompt_embeds=None,
        negative_prompt_embeds=None,
        negative_pooled_prompt_embeds=None,
-        ip_adapter_image=None,
-        ip_adapter_image_embeds=None,
        callback_on_step_end_tensor_inputs=None,
        max_sequence_length=None,
    ):
        if strength < 0 or strength > 1:
            raise ValueError(f"The value of strength should in [0.0, 1.0] but is {strength}")

-        if not isinstance(num_inference_steps, int) or num_inference_steps <= 0:
-            raise ValueError(
-                f"`num_inference_steps` has to be a positive integer but is {num_inference_steps} of type"
-                f" {type(num_inference_steps)}."
-            )
-
        if height % 8 != 0 or width % 8 != 0:
            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")

@@ -540,21 +448,6 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
                "If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
            )

-        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
-            raise ValueError(
-                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
-            )
-
-        if ip_adapter_image_embeds is not None:
-            if not isinstance(ip_adapter_image_embeds, list):
-                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}"
-                )
-            elif ip_adapter_image_embeds[0].ndim not in [3, 4]:
-                raise ValueError(
-                    f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D"
-                )
-
        if max_sequence_length is not None and max_sequence_length > 256:
            raise ValueError(f"`max_sequence_length` cannot be greater than 256 but is {max_sequence_length}")

@@ -806,8 +699,6 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
        pooled_prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.Tensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -910,12 +801,6 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
                input argument.
-            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.Tensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of
-                IP-adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should
-                contain the negative image embedding if `do_classifier_free_guidance` is set to `True`. If not
-                provided, embeddings are computed from the `ip_adapter_image` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generate image. Choose between
                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
@@ -987,7 +872,6 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
        self.check_inputs(
            prompt,
            strength,
-            num_inference_steps,
            height,
            width,
            negative_prompt,
@@ -995,8 +879,6 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
            pooled_prompt_embeds,
            negative_prompt_embeds,
            negative_pooled_prompt_embeds,
-            ip_adapter_image,
-            ip_adapter_image_embeds,
            callback_on_step_end_tensor_inputs,
            max_sequence_length=max_sequence_length,
        )
@@ -1108,15 +990,6 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
        add_text_embeds = add_text_embeds.to(device)
        add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)

-        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
-            image_embeds = self.prepare_ip_adapter_image_embeds(
-                ip_adapter_image,
-                ip_adapter_image_embeds,
-                device,
-                batch_size * num_images_per_prompt,
-                self.do_classifier_free_guidance,
-            )
-
        # 9. Denoising loop
        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)

@@ -1164,9 +1037,6 @@ class KolorsImg2ImgPipeline(DiffusionPipeline, StableDiffusionMixin, StableDiffu
                # predict the noise residual
                added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}

-                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
-                    added_cond_kwargs["image_embeds"] = image_embeds
-
                noise_pred = self.unet(
                    latent_model_input,
                    t,
@@ -1,50 +0,0 @@
-from typing import TYPE_CHECKING
-
-from ...utils import (
-    DIFFUSERS_SLOW_IMPORT,
-    OptionalDependencyNotAvailable,
-    _LazyModule,
-    get_objects_from_module,
-    is_torch_available,
-    is_transformers_available,
-    is_transformers_version,
-)
-
-
-_dummy_objects = {}
-_import_structure = {}
-
-try:
-    if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ...utils import dummy_torch_and_transformers_objects
-
-    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
-else:
-    _import_structure["modeling_stable_audio"] = ["StableAudioProjectionModel"]
-    _import_structure["pipeline_stable_audio"] = ["StableAudioPipeline"]
-
-
-if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
-    try:
-        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        from ...utils.dummy_torch_and_transformers_objects import *
-
-    else:
-        from .modeling_stable_audio import StableAudioProjectionModel
-        from .pipeline_stable_audio import StableAudioPipeline
-
-else:
-    import sys
-
-    sys.modules[__name__] = _LazyModule(
-        __name__,
-        globals()["__file__"],
-        _import_structure,
-        module_spec=__spec__,
-    )
-    for name, value in _dummy_objects.items():
-        setattr(sys.modules[__name__], name, value)
@@ -1,158 +0,0 @@
-# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import dataclass
-from math import pi
-from typing import Optional
-
-import torch
-import torch.nn as nn
-import torch.utils.checkpoint
-
-from ...configuration_utils import ConfigMixin, register_to_config
-from ...models.modeling_utils import ModelMixin
-from ...utils import BaseOutput, logging
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-class StableAudioPositionalEmbedding(nn.Module):
-    """Used for continuous time"""
-
-    def __init__(self, dim: int):
-        super().__init__()
-        assert (dim % 2) == 0
-        half_dim = dim // 2
-        self.weights = nn.Parameter(torch.randn(half_dim))
-
-    def forward(self, times: torch.Tensor) -> torch.Tensor:
-        times = times[..., None]
-        freqs = times * self.weights[None] * 2 * pi
-        fouriered = torch.cat((freqs.sin(), freqs.cos()), dim=-1)
-        fouriered = torch.cat((times, fouriered), dim=-1)
-        return fouriered
-
-
-@dataclass
-class StableAudioProjectionModelOutput(BaseOutput):
-    """
-    Args:
-    Class for StableAudio projection layer's outputs.
-        text_hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
-            Sequence of hidden-states obtained by linearly projecting the hidden-states for the text encoder.
-        seconds_start_hidden_states (`torch.Tensor` of shape `(batch_size, 1, hidden_size)`, *optional*):
-            Sequence of hidden-states obtained by linearly projecting the audio start hidden states.
-        seconds_end_hidden_states (`torch.Tensor` of shape `(batch_size, 1, hidden_size)`, *optional*):
-            Sequence of hidden-states obtained by linearly projecting the audio end hidden states.
-    """
-
-    text_hidden_states: Optional[torch.Tensor] = None
-    seconds_start_hidden_states: Optional[torch.Tensor] = None
-    seconds_end_hidden_states: Optional[torch.Tensor] = None
-
-
-class StableAudioNumberConditioner(nn.Module):
-    """
-    A simple linear projection model to map numbers to a latent space.
-
-    Args:
-        number_embedding_dim (`int`):
-            Dimensionality of the number embeddings.
-        min_value (`int`):
-            The minimum value of the seconds number conditioning modules.
-        max_value (`int`):
-            The maximum value of the seconds number conditioning modules
-        internal_dim (`int`):
-            Dimensionality of the intermediate number hidden states.
-    """
-
-    def __init__(
-        self,
-        number_embedding_dim,
-        min_value,
-        max_value,
-        internal_dim: Optional[int] = 256,
-    ):
-        super().__init__()
-        self.time_positional_embedding = nn.Sequential(
-            StableAudioPositionalEmbedding(internal_dim),
-            nn.Linear(in_features=internal_dim + 1, out_features=number_embedding_dim),
-        )
-
-        self.number_embedding_dim = number_embedding_dim
-        self.min_value = min_value
-        self.max_value = max_value
-
-    def forward(
-        self,
-        floats: torch.Tensor,
-    ):
-        floats = floats.clamp(self.min_value, self.max_value)
-
-        normalized_floats = (floats - self.min_value) / (self.max_value - self.min_value)
-
-        # Cast floats to same type as embedder
-        embedder_dtype = next(self.time_positional_embedding.parameters()).dtype
-        normalized_floats = normalized_floats.to(embedder_dtype)
-
-        embedding = self.time_positional_embedding(normalized_floats)
-        float_embeds = embedding.view(-1, 1, self.number_embedding_dim)
-
-        return float_embeds
-
-
-class StableAudioProjectionModel(ModelMixin, ConfigMixin):
-    """
-    A simple linear projection model to map the conditioning values to a shared latent space.
-
-    Args:
-        text_encoder_dim (`int`):
-            Dimensionality of the text embeddings from the text encoder (T5).
-        conditioning_dim (`int`):
-            Dimensionality of the output conditioning tensors.
-        min_value (`int`):
-            The minimum value of the seconds number conditioning modules.
-        max_value (`int`):
-            The maximum value of the seconds number conditioning modules
-    """
-
-    @register_to_config
-    def __init__(self, text_encoder_dim, conditioning_dim, min_value, max_value):
-        super().__init__()
-        self.text_projection = (
-            nn.Identity() if conditioning_dim == text_encoder_dim else nn.Linear(text_encoder_dim, conditioning_dim)
-        )
-        self.start_number_conditioner = StableAudioNumberConditioner(conditioning_dim, min_value, max_value)
-        self.end_number_conditioner = StableAudioNumberConditioner(conditioning_dim, min_value, max_value)
-
-    def forward(
-        self,
-        text_hidden_states: Optional[torch.Tensor] = None,
-        start_seconds: Optional[torch.Tensor] = None,
-        end_seconds: Optional[torch.Tensor] = None,
-    ):
-        text_hidden_states = (
-            text_hidden_states if text_hidden_states is None else self.text_projection(text_hidden_states)
-        )
-        seconds_start_hidden_states = (
-            start_seconds if start_seconds is None else self.start_number_conditioner(start_seconds)
-        )
-        seconds_end_hidden_states = end_seconds if end_seconds is None else self.end_number_conditioner(end_seconds)
-
-        return StableAudioProjectionModelOutput(
-            text_hidden_states=text_hidden_states,
-            seconds_start_hidden_states=seconds_start_hidden_states,
-            seconds_end_hidden_states=seconds_end_hidden_states,
-        )
@@ -1,745 +0,0 @@
-# Copyright 2024 Stability AI and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-from typing import Callable, List, Optional, Union
-
-import torch
-from transformers import (
-    T5EncoderModel,
-    T5Tokenizer,
-    T5TokenizerFast,
-)
-
-from ...models import AutoencoderOobleck, StableAudioDiTModel
-from ...models.embeddings import get_1d_rotary_pos_embed
-from ...schedulers import EDMDPMSolverMultistepScheduler
-from ...utils import (
-    logging,
-    replace_example_docstring,
-)
-from ...utils.torch_utils import randn_tensor
-from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
-from .modeling_stable_audio import StableAudioProjectionModel
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```py
-        >>> import scipy
-        >>> import torch
-        >>> import soundfile as sf
-        >>> from diffusers import StableAudioPipeline
-
-        >>> repo_id = "stabilityai/stable-audio-open-1.0"
-        >>> pipe = StableAudioPipeline.from_pretrained(repo_id, torch_dtype=torch.float16)
-        >>> pipe = pipe.to("cuda")
-
-        >>> # define the prompts
-        >>> prompt = "The sound of a hammer hitting a wooden surface."
-        >>> negative_prompt = "Low quality."
-
-        >>> # set the seed for generator
-        >>> generator = torch.Generator("cuda").manual_seed(0)
-
-        >>> # run the generation
-        >>> audio = pipe(
-        ...     prompt,
-        ...     negative_prompt=negative_prompt,
-        ...     num_inference_steps=200,
-        ...     audio_end_in_s=10.0,
-        ...     num_waveforms_per_prompt=3,
-        ...     generator=generator,
-        ... ).audios
-
-        >>> output = audio[0].T.float().cpu().numpy()
-        >>> sf.write("hammer.wav", output, pipe.vae.sampling_rate)
-        ```
-"""
-
-
-class StableAudioPipeline(DiffusionPipeline):
-    r"""
-    Pipeline for text-to-audio generation using StableAudio.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
-    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
-
-    Args:
-        vae ([`AutoencoderOobleck`]):
-            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
-        text_encoder ([`~transformers.T5EncoderModel`]):
-            Frozen text-encoder. StableAudio uses the encoder of
-            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel), specifically the
-            [google-t5/t5-base](https://huggingface.co/google-t5/t5-base) variant.
-        projection_model ([`StableAudioProjectionModel`]):
-            A trained model used to linearly project the hidden-states from the text encoder model and the start and
-            end seconds. The projected hidden-states from the encoder and the conditional seconds are concatenated to
-            give the input to the transformer model.
-        tokenizer ([`~transformers.T5Tokenizer`]):
-            Tokenizer to tokenize text for the frozen text-encoder.
-        transformer ([`StableAudioDiTModel`]):
-            A `StableAudioDiTModel` to denoise the encoded audio latents.
-        scheduler ([`EDMDPMSolverMultistepScheduler`]):
-            A scheduler to be used in combination with `transformer` to denoise the encoded audio latents.
-    """
-
-    model_cpu_offload_seq = "text_encoder->projection_model->transformer->vae"
-
-    def __init__(
-        self,
-        vae: AutoencoderOobleck,
-        text_encoder: T5EncoderModel,
-        projection_model: StableAudioProjectionModel,
-        tokenizer: Union[T5Tokenizer, T5TokenizerFast],
-        transformer: StableAudioDiTModel,
-        scheduler: EDMDPMSolverMultistepScheduler,
-    ):
-        super().__init__()
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            projection_model=projection_model,
-            tokenizer=tokenizer,
-            transformer=transformer,
-            scheduler=scheduler,
-        )
-        self.rotary_embed_dim = self.transformer.config.attention_head_dim // 2
-
-    # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.enable_vae_slicing
-    def enable_vae_slicing(self):
-        r"""
-        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
-        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
-        """
-        self.vae.enable_slicing()
-
-    # Copied from diffusers.pipelines.pipeline_utils.StableDiffusionMixin.disable_vae_slicing
-    def disable_vae_slicing(self):
-        r"""
-        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
-        computing decoding in one step.
-        """
-        self.vae.disable_slicing()
-
-    def encode_prompt(
-        self,
-        prompt,
-        device,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.LongTensor] = None,
-        negative_attention_mask: Optional[torch.LongTensor] = None,
-    ):
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            # 1. Tokenize text
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            attention_mask = text_inputs.attention_mask
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-
-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    f"The following part of your input was truncated because {self.text_encoder.config.model_type} can "
-                    f"only handle sequences up to {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            text_input_ids = text_input_ids.to(device)
-            attention_mask = attention_mask.to(device)
-
-            # 2. Text encoder forward
-            self.text_encoder.eval()
-            prompt_embeds = self.text_encoder(
-                text_input_ids,
-                attention_mask=attention_mask,
-            )
-            prompt_embeds = prompt_embeds[0]
-
-        if do_classifier_free_guidance and negative_prompt is not None:
-            uncond_tokens: List[str]
-            if type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            # 1. Tokenize text
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-
-            uncond_input_ids = uncond_input.input_ids.to(device)
-            negative_attention_mask = uncond_input.attention_mask.to(device)
-
-            # 2. Text encoder forward
-            self.text_encoder.eval()
-            negative_prompt_embeds = self.text_encoder(
-                uncond_input_ids,
-                attention_mask=negative_attention_mask,
-            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
-
-            if negative_attention_mask is not None:
-                # set the masked tokens to the null embed
-                negative_prompt_embeds = torch.where(
-                    negative_attention_mask.to(torch.bool).unsqueeze(2), negative_prompt_embeds, 0.0
-                )
-
-        # 3. Project prompt_embeds and negative_prompt_embeds
-        if do_classifier_free_guidance and negative_prompt_embeds is not None:
-            # For classifier free guidance, we need to do two forward passes.
-            # Here we concatenate the negative and text embeddings into a single batch
-            # to avoid doing two forward passes
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-            if attention_mask is not None and negative_attention_mask is None:
-                negative_attention_mask = torch.ones_like(attention_mask)
-            elif attention_mask is None and negative_attention_mask is not None:
-                attention_mask = torch.ones_like(negative_attention_mask)
-
-            if attention_mask is not None:
-                attention_mask = torch.cat([negative_attention_mask, attention_mask])
-
-        prompt_embeds = self.projection_model(
-            text_hidden_states=prompt_embeds,
-        ).text_hidden_states
-        if attention_mask is not None:
-            prompt_embeds = prompt_embeds * attention_mask.unsqueeze(-1).to(prompt_embeds.dtype)
-            prompt_embeds = prompt_embeds * attention_mask.unsqueeze(-1).to(prompt_embeds.dtype)
-
-        return prompt_embeds
-
-    def encode_duration(
-        self,
-        audio_start_in_s,
-        audio_end_in_s,
-        device,
-        do_classifier_free_guidance,
-        batch_size,
-    ):
-        audio_start_in_s = audio_start_in_s if isinstance(audio_start_in_s, list) else [audio_start_in_s]
-        audio_end_in_s = audio_end_in_s if isinstance(audio_end_in_s, list) else [audio_end_in_s]
-
-        if len(audio_start_in_s) == 1:
-            audio_start_in_s = audio_start_in_s * batch_size
-        if len(audio_end_in_s) == 1:
-            audio_end_in_s = audio_end_in_s * batch_size
-
-        # Cast the inputs to floats
-        audio_start_in_s = [float(x) for x in audio_start_in_s]
-        audio_start_in_s = torch.tensor(audio_start_in_s).to(device)
-
-        audio_end_in_s = [float(x) for x in audio_end_in_s]
-        audio_end_in_s = torch.tensor(audio_end_in_s).to(device)
-
-        projection_output = self.projection_model(
-            start_seconds=audio_start_in_s,
-            end_seconds=audio_end_in_s,
-        )
-        seconds_start_hidden_states = projection_output.seconds_start_hidden_states
-        seconds_end_hidden_states = projection_output.seconds_end_hidden_states
-
-        # For classifier free guidance, we need to do two forward passes.
-        # Here we repeat the audio hidden states to avoid doing two forward passes
-        if do_classifier_free_guidance:
-            seconds_start_hidden_states = torch.cat([seconds_start_hidden_states, seconds_start_hidden_states], dim=0)
-            seconds_end_hidden_states = torch.cat([seconds_end_hidden_states, seconds_end_hidden_states], dim=0)
-
-        return seconds_start_hidden_states, seconds_end_hidden_states
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
-    def prepare_extra_step_kwargs(self, generator, eta):
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        if accepts_generator:
-            extra_step_kwargs["generator"] = generator
-        return extra_step_kwargs
-
-    def check_inputs(
-        self,
-        prompt,
-        audio_start_in_s,
-        audio_end_in_s,
-        callback_steps,
-        negative_prompt=None,
-        prompt_embeds=None,
-        negative_prompt_embeds=None,
-        attention_mask=None,
-        negative_attention_mask=None,
-        initial_audio_waveforms=None,
-        initial_audio_sampling_rate=None,
-    ):
-        if audio_end_in_s < audio_start_in_s:
-            raise ValueError(
-                f"`audio_end_in_s={audio_end_in_s}' must be higher than 'audio_start_in_s={audio_start_in_s}` but "
-            )
-
-        if (
-            audio_start_in_s < self.projection_model.config.min_value
-            or audio_start_in_s > self.projection_model.config.max_value
-        ):
-            raise ValueError(
-                f"`audio_start_in_s` must be greater than or equal to {self.projection_model.config.min_value}, and lower than or equal to {self.projection_model.config.max_value} but "
-                f"is {audio_start_in_s}."
-            )
-
-        if (
-            audio_end_in_s < self.projection_model.config.min_value
-            or audio_end_in_s > self.projection_model.config.max_value
-        ):
-            raise ValueError(
-                f"`audio_end_in_s` must be greater than or equal to {self.projection_model.config.min_value}, and lower than or equal to {self.projection_model.config.max_value} but "
-                f"is {audio_end_in_s}."
-            )
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-
-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and (prompt_embeds is None):
-            raise ValueError(
-                "Provide either `prompt`, or `prompt_embeds`. Cannot leave"
-                "`prompt` undefined without specifying `prompt_embeds`."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-            if attention_mask is not None and attention_mask.shape != prompt_embeds.shape[:2]:
-                raise ValueError(
-                    "`attention_mask should have the same batch size and sequence length as `prompt_embeds`, but got:"
-                    f"`attention_mask: {attention_mask.shape} != `prompt_embeds` {prompt_embeds.shape}"
-                )
-
-        if initial_audio_sampling_rate is None and initial_audio_waveforms is not None:
-            raise ValueError(
-                "`initial_audio_waveforms' is provided but the sampling rate is not. Make sure to pass `initial_audio_sampling_rate`."
-            )
-
-        if initial_audio_sampling_rate is not None and initial_audio_sampling_rate != self.vae.sampling_rate:
-            raise ValueError(
-                f"`initial_audio_sampling_rate` must be {self.vae.hop_length}' but is `{initial_audio_sampling_rate}`."
-                "Make sure to resample the `initial_audio_waveforms` and to correct the sampling rate. "
-            )
-
-    def prepare_latents(
-        self,
-        batch_size,
-        num_channels_vae,
-        sample_size,
-        dtype,
-        device,
-        generator,
-        latents=None,
-        initial_audio_waveforms=None,
-        num_waveforms_per_prompt=None,
-        audio_channels=None,
-    ):
-        shape = (batch_size, num_channels_vae, sample_size)
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            latents = latents.to(device)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-
-        # encode the initial audio for use by the model
-        if initial_audio_waveforms is not None:
-            # check dimension
-            if initial_audio_waveforms.ndim == 2:
-                initial_audio_waveforms = initial_audio_waveforms.unsqueeze(1)
-            elif initial_audio_waveforms.ndim != 3:
-                raise ValueError(
-                    f"`initial_audio_waveforms` must be of shape `(batch_size, num_channels, audio_length)` or `(batch_size, audio_length)` but has `{initial_audio_waveforms.ndim}` dimensions"
-                )
-
-            audio_vae_length = self.transformer.config.sample_size * self.vae.hop_length
-            audio_shape = (batch_size // num_waveforms_per_prompt, audio_channels, audio_vae_length)
-
-            # check num_channels
-            if initial_audio_waveforms.shape[1] == 1 and audio_channels == 2:
-                initial_audio_waveforms = initial_audio_waveforms.repeat(1, 2, 1)
-            elif initial_audio_waveforms.shape[1] == 2 and audio_channels == 1:
-                initial_audio_waveforms = initial_audio_waveforms.mean(1, keepdim=True)
-
-            if initial_audio_waveforms.shape[:2] != audio_shape[:2]:
-                raise ValueError(
-                    f"`initial_audio_waveforms` must be of shape `(batch_size, num_channels, audio_length)` or `(batch_size, audio_length)` but is of shape `{initial_audio_waveforms.shape}`"
-                )
-
-            # crop or pad
-            audio_length = initial_audio_waveforms.shape[-1]
-            if audio_length < audio_vae_length:
-                logger.warning(
-                    f"The provided input waveform is shorter ({audio_length}) than the required audio length ({audio_vae_length}) of the model and will thus be padded."
-                )
-            elif audio_length > audio_vae_length:
-                logger.warning(
-                    f"The provided input waveform is longer ({audio_length}) than the required audio length ({audio_vae_length}) of the model and will thus be cropped."
-                )
-
-            audio = initial_audio_waveforms.new_zeros(audio_shape)
-            audio[:, :, : min(audio_length, audio_vae_length)] = initial_audio_waveforms[:, :, :audio_vae_length]
-
-            encoded_audio = self.vae.encode(audio).latent_dist.sample(generator)
-            encoded_audio = encoded_audio.repeat((num_waveforms_per_prompt, 1, 1))
-            latents = encoded_audio + latents
-        return latents
-
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        prompt: Union[str, List[str]] = None,
-        audio_end_in_s: Optional[float] = None,
-        audio_start_in_s: Optional[float] = 0.0,
-        num_inference_steps: int = 100,
-        guidance_scale: float = 7.0,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_waveforms_per_prompt: Optional[int] = 1,
-        eta: float = 0.0,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-        initial_audio_waveforms: Optional[torch.Tensor] = None,
-        initial_audio_sampling_rate: Optional[torch.Tensor] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.LongTensor] = None,
-        negative_attention_mask: Optional[torch.LongTensor] = None,
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.Tensor], None]] = None,
-        callback_steps: Optional[int] = 1,
-        output_type: Optional[str] = "pt",
-    ):
-        r"""
-        The call function to the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide audio generation. If not defined, you need to pass `prompt_embeds`.
-            audio_end_in_s (`float`, *optional*, defaults to 47.55):
-                Audio end index in seconds.
-            audio_start_in_s (`float`, *optional*, defaults to 0):
-                Audio start index in seconds.
-            num_inference_steps (`int`, *optional*, defaults to 100):
-                The number of denoising steps. More denoising steps usually lead to a higher quality audio at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 7.0):
-                A higher guidance scale value encourages the model to generate audio that is closely linked to the text
-                `prompt` at the expense of lower sound quality. Guidance scale is enabled when `guidance_scale > 1`.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide what to not include in audio generation. If not defined, you need to
-                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
-            num_waveforms_per_prompt (`int`, *optional*, defaults to 1):
-                The number of waveforms to generate per prompt.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
-                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
-                generation deterministic.
-            latents (`torch.Tensor`, *optional*):
-                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for audio
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor is generated by sampling using the supplied random `generator`.
-            initial_audio_waveforms (`torch.Tensor`, *optional*):
-                Optional initial audio waveforms to use as the initial audio waveform for generation. Must be of shape
-                `(batch_size, num_channels, audio_length)` or `(batch_size, audio_length)`, where `batch_size`
-                corresponds to the number of prompts passed to the model.
-            initial_audio_sampling_rate (`int`, *optional*):
-                Sampling rate of the `initial_audio_waveforms`, if they are provided. Must be the same as the model.
-            prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-computed text embeddings from the text encoder model. Can be used to easily tweak text inputs,
-                *e.g.* prompt weighting. If not provided, text embeddings will be computed from `prompt` input
-                argument.
-            negative_prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-computed negative text embeddings from the text encoder model. Can be used to easily tweak text
-                inputs, *e.g.* prompt weighting. If not provided, negative_prompt_embeds will be computed from
-                `negative_prompt` input argument.
-            attention_mask (`torch.LongTensor`, *optional*):
-                Pre-computed attention mask to be applied to the `prompt_embeds`. If not provided, attention mask will
-                be computed from `prompt` input argument.
-            negative_attention_mask (`torch.LongTensor`, *optional*):
-                Pre-computed attention mask to be applied to the `negative_text_audio_duration_embeds`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-            callback (`Callable`, *optional*):
-                A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.Tensor)`.
-            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function is called. If not specified, the callback is called at
-                every step.
-            output_type (`str`, *optional*, defaults to `"pt"`):
-                The output format of the generated audio. Choose between `"np"` to return a NumPy `np.ndarray` or
-                `"pt"` to return a PyTorch `torch.Tensor` object. Set to `"latent"` to return the latent diffusion
-                model (LDM) output.
-
-        Examples:
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
-                otherwise a `tuple` is returned where the first element is a list with the generated audio.
-        """
-        # 0. Convert audio input length from seconds to latent length
-        downsample_ratio = self.vae.hop_length
-
-        max_audio_length_in_s = self.transformer.config.sample_size * downsample_ratio / self.vae.config.sampling_rate
-        if audio_end_in_s is None:
-            audio_end_in_s = max_audio_length_in_s
-
-        if audio_end_in_s - audio_start_in_s > max_audio_length_in_s:
-            raise ValueError(
-                f"The total audio length requested ({audio_end_in_s-audio_start_in_s}s) is longer than the model maximum possible length ({max_audio_length_in_s}). Make sure that 'audio_end_in_s-audio_start_in_s<={max_audio_length_in_s}'."
-            )
-
-        waveform_start = int(audio_start_in_s * self.vae.config.sampling_rate)
-        waveform_end = int(audio_end_in_s * self.vae.config.sampling_rate)
-        waveform_length = int(self.transformer.config.sample_size)
-
-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(
-            prompt,
-            audio_start_in_s,
-            audio_end_in_s,
-            callback_steps,
-            negative_prompt,
-            prompt_embeds,
-            negative_prompt_embeds,
-            attention_mask,
-            negative_attention_mask,
-            initial_audio_waveforms,
-            initial_audio_sampling_rate,
-        )
-
-        # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        device = self._execution_device
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-
-        # 3. Encode input prompt
-        prompt_embeds = self.encode_prompt(
-            prompt,
-            device,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds,
-            negative_prompt_embeds,
-            attention_mask,
-            negative_attention_mask,
-        )
-
-        # Encode duration
-        seconds_start_hidden_states, seconds_end_hidden_states = self.encode_duration(
-            audio_start_in_s,
-            audio_end_in_s,
-            device,
-            do_classifier_free_guidance and (negative_prompt is not None or negative_prompt_embeds is not None),
-            batch_size,
-        )
-
-        # Create text_audio_duration_embeds and audio_duration_embeds
-        text_audio_duration_embeds = torch.cat(
-            [prompt_embeds, seconds_start_hidden_states, seconds_end_hidden_states], dim=1
-        )
-
-        audio_duration_embeds = torch.cat([seconds_start_hidden_states, seconds_end_hidden_states], dim=2)
-
-        # In case of classifier free guidance without negative prompt, we need to create unconditional embeddings and
-        # to concatenate it to the embeddings
-        if do_classifier_free_guidance and negative_prompt_embeds is None and negative_prompt is None:
-            negative_text_audio_duration_embeds = torch.zeros_like(
-                text_audio_duration_embeds, device=text_audio_duration_embeds.device
-            )
-            text_audio_duration_embeds = torch.cat(
-                [negative_text_audio_duration_embeds, text_audio_duration_embeds], dim=0
-            )
-            audio_duration_embeds = torch.cat([audio_duration_embeds, audio_duration_embeds], dim=0)
-
-        bs_embed, seq_len, hidden_size = text_audio_duration_embeds.shape
-        # duplicate audio_duration_embeds and text_audio_duration_embeds for each generation per prompt, using mps friendly method
-        text_audio_duration_embeds = text_audio_duration_embeds.repeat(1, num_waveforms_per_prompt, 1)
-        text_audio_duration_embeds = text_audio_duration_embeds.view(
-            bs_embed * num_waveforms_per_prompt, seq_len, hidden_size
-        )
-
-        audio_duration_embeds = audio_duration_embeds.repeat(1, num_waveforms_per_prompt, 1)
-        audio_duration_embeds = audio_duration_embeds.view(
-            bs_embed * num_waveforms_per_prompt, -1, audio_duration_embeds.shape[-1]
-        )
-
-        # 4. Prepare timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps = self.scheduler.timesteps
-
-        # 5. Prepare latent variables
-        num_channels_vae = self.transformer.config.in_channels
-        latents = self.prepare_latents(
-            batch_size * num_waveforms_per_prompt,
-            num_channels_vae,
-            waveform_length,
-            text_audio_duration_embeds.dtype,
-            device,
-            generator,
-            latents,
-            initial_audio_waveforms,
-            num_waveforms_per_prompt,
-            audio_channels=self.vae.config.audio_channels,
-        )
-
-        # 6. Prepare extra step kwargs
-        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-
-        # 7. Prepare rotary positional embedding
-        rotary_embedding = get_1d_rotary_pos_embed(
-            self.rotary_embed_dim,
-            latents.shape[2] + audio_duration_embeds.shape[1],
-            use_real=True,
-            repeat_interleave_real=False,
-        )
-
-        # 8. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                # expand the latents if we are doing classifier free guidance
-                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-
-                # predict the noise residual
-                noise_pred = self.transformer(
-                    latent_model_input,
-                    t.unsqueeze(0),
-                    encoder_hidden_states=text_audio_duration_embeds,
-                    global_hidden_states=audio_duration_embeds,
-                    rotary_embedding=rotary_embedding,
-                    return_dict=False,
-                )[0]
-
-                # perform guidance
-                if do_classifier_free_guidance:
-                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-                    if callback is not None and i % callback_steps == 0:
-                        step_idx = i // getattr(self.scheduler, "order", 1)
-                        callback(step_idx, t, latents)
-
-        # 9. Post-processing
-        if not output_type == "latent":
-            audio = self.vae.decode(latents).sample
-        else:
-            return AudioPipelineOutput(audios=latents)
-
-        audio = audio[:, :, waveform_start:waveform_end]
-
-        if output_type == "np":
-            audio = audio.cpu().float().numpy()
-
-        self.maybe_free_model_hooks()
-
-        if not return_dict:
-            return (audio,)
-
-        return AudioPipelineOutput(audios=audio)
@@ -1370,8 +1370,6 @@ def download_from_original_stable_diffusion_ckpt(

    if "unet_config" in original_config["model"]["params"]:
        original_config["model"]["params"]["unet_config"]["params"]["in_channels"] = num_in_channels
-    elif "network_config" in original_config["model"]["params"]:
-        original_config["model"]["params"]["network_config"]["params"]["in_channels"] = num_in_channels

    if (
        "parameterization" in original_config["model"]["params"]
@@ -710,7 +710,7 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
-            guidance_scale (`float`, *optional*, defaults to 7.0):
+            guidance_scale (`float`, *optional*, defaults to 5.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
@@ -752,7 +752,7 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
-            guidance_scale (`float`, *optional*, defaults to 7.0):
+            guidance_scale (`float`, *optional*, defaults to 5.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
@@ -878,7 +878,7 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline):
                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
                passed will be used. Must be in descending order.
-            guidance_scale (`float`, *optional*, defaults to 7.0):
+            guidance_scale (`float`, *optional*, defaults to 5.0):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
@@ -118,7 +118,6 @@ except OptionalDependencyNotAvailable:
    _dummy_modules.update(get_objects_from_module(dummy_torch_and_torchsde_objects))

 else:
-    _import_structure["scheduling_cosine_dpmsolver_multistep"] = ["CosineDPMSolverMultistepScheduler"]
    _import_structure["scheduling_dpmsolver_sde"] = ["DPMSolverSDEScheduler"]

 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -206,7 +205,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    except OptionalDependencyNotAvailable:
        from ..utils.dummy_torch_and_torchsde_objects import *  # noqa F403
    else:
-        from .scheduling_cosine_dpmsolver_multistep import CosineDPMSolverMultistepScheduler
        from .scheduling_dpmsolver_sde import DPMSolverSDEScheduler

 else:
@@ -1,572 +0,0 @@
-# Copyright 2024 TSAIL Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver and https://github.com/NVlabs/edm
-
-import math
-from typing import List, Optional, Tuple, Union
-
-import numpy as np
-import torch
-
-from ..configuration_utils import ConfigMixin, register_to_config
-from .scheduling_dpmsolver_sde import BrownianTreeNoiseSampler
-from .scheduling_utils import SchedulerMixin, SchedulerOutput
-
-
-class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
-    """
-    Implements a variant of `DPMSolverMultistepScheduler` with cosine schedule, proposed by Nichol and Dhariwal (2021).
-    This scheduler was used in Stable Audio Open [1].
-
-    [1] Evans, Parker, et al. "Stable Audio Open" https://arxiv.org/abs/2407.14358
-
-    This model inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the generic
-    methods the library implements for all schedulers such as loading and saving.
-
-    Args:
-        sigma_min (`float`, *optional*, defaults to 0.3):
-            Minimum noise magnitude in the sigma schedule. This was set to 0.3 in Stable Audio Open [1].
-        sigma_max (`float`, *optional*, defaults to 500):
-            Maximum noise magnitude in the sigma schedule. This was set to 500 in Stable Audio Open [1].
-        sigma_data (`float`, *optional*, defaults to 1.0):
-            The standard deviation of the data distribution. This is set to 1.0 in Stable Audio Open [1].
-        sigma_schedule (`str`, *optional*, defaults to `exponential`):
-            Sigma schedule to compute the `sigmas`. By default, we the schedule introduced in the EDM paper
-            (https://arxiv.org/abs/2206.00364). Other acceptable value is "exponential". The exponential schedule was
-            incorporated in this model: https://huggingface.co/stabilityai/cosxl.
-        num_train_timesteps (`int`, defaults to 1000):
-            The number of diffusion steps to train the model.
-        solver_order (`int`, defaults to 2):
-            The DPMSolver order which can be `1` or `2`. It is recommended to use `solver_order=2`.
-        prediction_type (`str`, defaults to `v_prediction`, *optional*):
-            Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
-            `sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
-            Video](https://imagen.research.google/video/paper.pdf) paper).
-        solver_type (`str`, defaults to `midpoint`):
-            Solver type for the second-order solver; can be `midpoint` or `heun`. The solver type slightly affects the
-            sample quality, especially for a small number of steps. It is recommended to use `midpoint` solvers.
-        lower_order_final (`bool`, defaults to `True`):
-            Whether to use lower-order solvers in the final steps. Only valid for < 15 inference steps. This can
-            stabilize the sampling of DPMSolver for steps < 15, especially for steps <= 10.
-        euler_at_final (`bool`, defaults to `False`):
-            Whether to use Euler's method in the final step. It is a trade-off between numerical stability and detail
-            richness. This can stabilize the sampling of the SDE variant of DPMSolver for small number of inference
-            steps, but sometimes may result in blurring.
-        final_sigmas_type (`str`, defaults to `"zero"`):
-            The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
-            sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
-    """
-
-    _compatibles = []
-    order = 1
-
-    @register_to_config
-    def __init__(
-        self,
-        sigma_min: float = 0.3,
-        sigma_max: float = 500,
-        sigma_data: float = 1.0,
-        sigma_schedule: str = "exponential",
-        num_train_timesteps: int = 1000,
-        solver_order: int = 2,
-        prediction_type: str = "v_prediction",
-        rho: float = 7.0,
-        solver_type: str = "midpoint",
-        lower_order_final: bool = True,
-        euler_at_final: bool = False,
-        final_sigmas_type: Optional[str] = "zero",  # "zero", "sigma_min"
-    ):
-        if solver_type not in ["midpoint", "heun"]:
-            if solver_type in ["logrho", "bh1", "bh2"]:
-                self.register_to_config(solver_type="midpoint")
-            else:
-                raise NotImplementedError(f"{solver_type} is not implemented for {self.__class__}")
-
-        ramp = torch.linspace(0, 1, num_train_timesteps)
-        if sigma_schedule == "karras":
-            sigmas = self._compute_karras_sigmas(ramp)
-        elif sigma_schedule == "exponential":
-            sigmas = self._compute_exponential_sigmas(ramp)
-
-        self.timesteps = self.precondition_noise(sigmas)
-
-        self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
-
-        # setable values
-        self.num_inference_steps = None
-        self.model_outputs = [None] * solver_order
-        self.lower_order_nums = 0
-        self._step_index = None
-        self._begin_index = None
-        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
-
-    @property
-    def init_noise_sigma(self):
-        # standard deviation of the initial noise distribution
-        return (self.config.sigma_max**2 + 1) ** 0.5
-
-    @property
-    def step_index(self):
-        """
-        The index counter for current timestep. It will increase 1 after each scheduler step.
-        """
-        return self._step_index
-
-    @property
-    def begin_index(self):
-        """
-        The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
-        """
-        return self._begin_index
-
-    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
-    def set_begin_index(self, begin_index: int = 0):
-        """
-        Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
-
-        Args:
-            begin_index (`int`):
-                The begin index for the scheduler.
-        """
-        self._begin_index = begin_index
-
-    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.precondition_inputs
-    def precondition_inputs(self, sample, sigma):
-        c_in = 1 / ((sigma**2 + self.config.sigma_data**2) ** 0.5)
-        scaled_sample = sample * c_in
-        return scaled_sample
-
-    def precondition_noise(self, sigma):
-        if not isinstance(sigma, torch.Tensor):
-            sigma = torch.tensor([sigma])
-
-        return sigma.atan() / math.pi * 2
-
-    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.precondition_outputs
-    def precondition_outputs(self, sample, model_output, sigma):
-        sigma_data = self.config.sigma_data
-        c_skip = sigma_data**2 / (sigma**2 + sigma_data**2)
-
-        if self.config.prediction_type == "epsilon":
-            c_out = sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5
-        elif self.config.prediction_type == "v_prediction":
-            c_out = -sigma * sigma_data / (sigma**2 + sigma_data**2) ** 0.5
-        else:
-            raise ValueError(f"Prediction type {self.config.prediction_type} is not supported.")
-
-        denoised = c_skip * sample + c_out * model_output
-
-        return denoised
-
-    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.scale_model_input
-    def scale_model_input(self, sample: torch.Tensor, timestep: Union[float, torch.Tensor]) -> torch.Tensor:
-        """
-        Ensures interchangeability with schedulers that need to scale the denoising model input depending on the
-        current timestep. Scales the denoising model input by `(sigma**2 + 1) ** 0.5` to match the Euler algorithm.
-
-        Args:
-            sample (`torch.Tensor`):
-                The input sample.
-            timestep (`int`, *optional*):
-                The current timestep in the diffusion chain.
-
-        Returns:
-            `torch.Tensor`:
-                A scaled input sample.
-        """
-        if self.step_index is None:
-            self._init_step_index(timestep)
-
-        sigma = self.sigmas[self.step_index]
-        sample = self.precondition_inputs(sample, sigma)
-
-        self.is_scale_input_called = True
-        return sample
-
-    def set_timesteps(self, num_inference_steps: int = None, device: Union[str, torch.device] = None):
-        """
-        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
-
-        Args:
-            num_inference_steps (`int`):
-                The number of diffusion steps used when generating samples with a pre-trained model.
-            device (`str` or `torch.device`, *optional*):
-                The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        """
-
-        self.num_inference_steps = num_inference_steps
-
-        ramp = torch.linspace(0, 1, self.num_inference_steps)
-        if self.config.sigma_schedule == "karras":
-            sigmas = self._compute_karras_sigmas(ramp)
-        elif self.config.sigma_schedule == "exponential":
-            sigmas = self._compute_exponential_sigmas(ramp)
-
-        sigmas = sigmas.to(dtype=torch.float32, device=device)
-        self.timesteps = self.precondition_noise(sigmas)
-
-        if self.config.final_sigmas_type == "sigma_min":
-            sigma_last = self.config.sigma_min
-        elif self.config.final_sigmas_type == "zero":
-            sigma_last = 0
-        else:
-            raise ValueError(
-                f"`final_sigmas_type` must be one of 'zero', or 'sigma_min', but got {self.config.final_sigmas_type}"
-            )
-
-        self.sigmas = torch.cat([sigmas, torch.tensor([sigma_last], dtype=torch.float32, device=device)])
-
-        self.model_outputs = [
-            None,
-        ] * self.config.solver_order
-        self.lower_order_nums = 0
-
-        # add an index counter for schedulers that allow duplicated timesteps
-        self._step_index = None
-        self._begin_index = None
-        self.sigmas = self.sigmas.to("cpu")  # to avoid too much CPU/GPU communication
-
-        # if a noise sampler is used, reinitialise it
-        self.noise_sampler = None
-
-    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_karras_sigmas
-    def _compute_karras_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor:
-        """Constructs the noise schedule of Karras et al. (2022)."""
-        sigma_min = sigma_min or self.config.sigma_min
-        sigma_max = sigma_max or self.config.sigma_max
-
-        rho = self.config.rho
-        min_inv_rho = sigma_min ** (1 / rho)
-        max_inv_rho = sigma_max ** (1 / rho)
-        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
-        return sigmas
-
-    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._compute_exponential_sigmas
-    def _compute_exponential_sigmas(self, ramp, sigma_min=None, sigma_max=None) -> torch.Tensor:
-        """Implementation closely follows k-diffusion.
-
-        https://github.com/crowsonkb/k-diffusion/blob/6ab5146d4a5ef63901326489f31f1d8e7dd36b48/k_diffusion/sampling.py#L26
-        """
-        sigma_min = sigma_min or self.config.sigma_min
-        sigma_max = sigma_max or self.config.sigma_max
-        sigmas = torch.linspace(math.log(sigma_min), math.log(sigma_max), len(ramp)).exp().flip(0)
-        return sigmas
-
-    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
-    def _sigma_to_t(self, sigma, log_sigmas):
-        # get log sigma
-        log_sigma = np.log(np.maximum(sigma, 1e-10))
-
-        # get distribution
-        dists = log_sigma - log_sigmas[:, np.newaxis]
-
-        # get sigmas range
-        low_idx = np.cumsum((dists >= 0), axis=0).argmax(axis=0).clip(max=log_sigmas.shape[0] - 2)
-        high_idx = low_idx + 1
-
-        low = log_sigmas[low_idx]
-        high = log_sigmas[high_idx]
-
-        # interpolate sigmas
-        w = (low - log_sigma) / (low - high)
-        w = np.clip(w, 0, 1)
-
-        # transform interpolation to time range
-        t = (1 - w) * low_idx + w * high_idx
-        t = t.reshape(sigma.shape)
-        return t
-
-    def _sigma_to_alpha_sigma_t(self, sigma):
-        alpha_t = torch.tensor(1)  # Inputs are pre-scaled before going into unet, so alpha_t = 1
-        sigma_t = sigma
-
-        return alpha_t, sigma_t
-
-    def convert_model_output(
-        self,
-        model_output: torch.Tensor,
-        sample: torch.Tensor = None,
-    ) -> torch.Tensor:
-        """
-        Convert the model output to the corresponding type the DPMSolver/DPMSolver++ algorithm needs. DPM-Solver is
-        designed to discretize an integral of the noise prediction model, and DPM-Solver++ is designed to discretize an
-        integral of the data prediction model.
-
-        <Tip>
-
-        The algorithm and model type are decoupled. You can use either DPMSolver or DPMSolver++ for both noise
-        prediction and data prediction models.
-
-        </Tip>
-
-        Args:
-            model_output (`torch.Tensor`):
-                The direct output from the learned diffusion model.
-            sample (`torch.Tensor`):
-                A current instance of a sample created by the diffusion process.
-
-        Returns:
-            `torch.Tensor`:
-                The converted model output.
-        """
-        sigma = self.sigmas[self.step_index]
-        x0_pred = self.precondition_outputs(sample, model_output, sigma)
-
-        return x0_pred
-
-    def dpm_solver_first_order_update(
-        self,
-        model_output: torch.Tensor,
-        sample: torch.Tensor = None,
-        noise: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """
-        One step for the first-order DPMSolver (equivalent to DDIM).
-
-        Args:
-            model_output (`torch.Tensor`):
-                The direct output from the learned diffusion model.
-            sample (`torch.Tensor`):
-                A current instance of a sample created by the diffusion process.
-
-        Returns:
-            `torch.Tensor`:
-                The sample tensor at the previous timestep.
-        """
-        sigma_t, sigma_s = self.sigmas[self.step_index + 1], self.sigmas[self.step_index]
-        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
-        alpha_s, sigma_s = self._sigma_to_alpha_sigma_t(sigma_s)
-        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
-        lambda_s = torch.log(alpha_s) - torch.log(sigma_s)
-
-        h = lambda_t - lambda_s
-        assert noise is not None
-        x_t = (
-            (sigma_t / sigma_s * torch.exp(-h)) * sample
-            + (alpha_t * (1 - torch.exp(-2.0 * h))) * model_output
-            + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
-        )
-
-        return x_t
-
-    def multistep_dpm_solver_second_order_update(
-        self,
-        model_output_list: List[torch.Tensor],
-        sample: torch.Tensor = None,
-        noise: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """
-        One step for the second-order multistep DPMSolver.
-
-        Args:
-            model_output_list (`List[torch.Tensor]`):
-                The direct outputs from learned diffusion model at current and latter timesteps.
-            sample (`torch.Tensor`):
-                A current instance of a sample created by the diffusion process.
-
-        Returns:
-            `torch.Tensor`:
-                The sample tensor at the previous timestep.
-        """
-        sigma_t, sigma_s0, sigma_s1 = (
-            self.sigmas[self.step_index + 1],
-            self.sigmas[self.step_index],
-            self.sigmas[self.step_index - 1],
-        )
-
-        alpha_t, sigma_t = self._sigma_to_alpha_sigma_t(sigma_t)
-        alpha_s0, sigma_s0 = self._sigma_to_alpha_sigma_t(sigma_s0)
-        alpha_s1, sigma_s1 = self._sigma_to_alpha_sigma_t(sigma_s1)
-
-        lambda_t = torch.log(alpha_t) - torch.log(sigma_t)
-        lambda_s0 = torch.log(alpha_s0) - torch.log(sigma_s0)
-        lambda_s1 = torch.log(alpha_s1) - torch.log(sigma_s1)
-
-        m0, m1 = model_output_list[-1], model_output_list[-2]
-
-        h, h_0 = lambda_t - lambda_s0, lambda_s0 - lambda_s1
-        r0 = h_0 / h
-        D0, D1 = m0, (1.0 / r0) * (m0 - m1)
-
-        # sde-dpmsolver++
-        assert noise is not None
-        if self.config.solver_type == "midpoint":
-            x_t = (
-                (sigma_t / sigma_s0 * torch.exp(-h)) * sample
-                + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
-                + 0.5 * (alpha_t * (1 - torch.exp(-2.0 * h))) * D1
-                + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
-            )
-        elif self.config.solver_type == "heun":
-            x_t = (
-                (sigma_t / sigma_s0 * torch.exp(-h)) * sample
-                + (alpha_t * (1 - torch.exp(-2.0 * h))) * D0
-                + (alpha_t * ((1.0 - torch.exp(-2.0 * h)) / (-2.0 * h) + 1.0)) * D1
-                + sigma_t * torch.sqrt(1.0 - torch.exp(-2 * h)) * noise
-            )
-
-        return x_t
-
-    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.index_for_timestep
-    def index_for_timestep(self, timestep, schedule_timesteps=None):
-        if schedule_timesteps is None:
-            schedule_timesteps = self.timesteps
-
-        index_candidates = (schedule_timesteps == timestep).nonzero()
-
-        if len(index_candidates) == 0:
-            step_index = len(self.timesteps) - 1
-        # The sigma index that is taken for the **very** first `step`
-        # is always the second index (or the last index if there is only 1)
-        # This way we can ensure we don't accidentally skip a sigma in
-        # case we start in the middle of the denoising schedule (e.g. for image-to-image)
-        elif len(index_candidates) > 1:
-            step_index = index_candidates[1].item()
-        else:
-            step_index = index_candidates[0].item()
-
-        return step_index
-
-    # Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler._init_step_index
-    def _init_step_index(self, timestep):
-        """
-        Initialize the step_index counter for the scheduler.
-        """
-
-        if self.begin_index is None:
-            if isinstance(timestep, torch.Tensor):
-                timestep = timestep.to(self.timesteps.device)
-            self._step_index = self.index_for_timestep(timestep)
-        else:
-            self._step_index = self._begin_index
-
-    def step(
-        self,
-        model_output: torch.Tensor,
-        timestep: Union[int, torch.Tensor],
-        sample: torch.Tensor,
-        generator=None,
-        return_dict: bool = True,
-    ) -> Union[SchedulerOutput, Tuple]:
-        """
-        Predict the sample from the previous timestep by reversing the SDE. This function propagates the sample with
-        the multistep DPMSolver.
-
-        Args:
-            model_output (`torch.Tensor`):
-                The direct output from learned diffusion model.
-            timestep (`int`):
-                The current discrete timestep in the diffusion chain.
-            sample (`torch.Tensor`):
-                A current instance of a sample created by the diffusion process.
-            generator (`torch.Generator`, *optional*):
-                A random number generator.
-            return_dict (`bool`):
-                Whether or not to return a [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`.
-
-        Returns:
-            [`~schedulers.scheduling_utils.SchedulerOutput`] or `tuple`:
-                If return_dict is `True`, [`~schedulers.scheduling_utils.SchedulerOutput`] is returned, otherwise a
-                tuple is returned where the first element is the sample tensor.
-
-        """
-        if self.num_inference_steps is None:
-            raise ValueError(
-                "Number of inference steps is 'None', you need to run 'set_timesteps' after creating the scheduler"
-            )
-
-        if self.step_index is None:
-            self._init_step_index(timestep)
-
-        # Improve numerical stability for small number of steps
-        lower_order_final = (self.step_index == len(self.timesteps) - 1) and (
-            self.config.euler_at_final
-            or (self.config.lower_order_final and len(self.timesteps) < 15)
-            or self.config.final_sigmas_type == "zero"
-        )
-        lower_order_second = (
-            (self.step_index == len(self.timesteps) - 2) and self.config.lower_order_final and len(self.timesteps) < 15
-        )
-
-        model_output = self.convert_model_output(model_output, sample=sample)
-        for i in range(self.config.solver_order - 1):
-            self.model_outputs[i] = self.model_outputs[i + 1]
-        self.model_outputs[-1] = model_output
-
-        if self.noise_sampler is None:
-            seed = None
-            if generator is not None:
-                seed = (
-                    [g.initial_seed() for g in generator] if isinstance(generator, list) else generator.initial_seed()
-                )
-            self.noise_sampler = BrownianTreeNoiseSampler(
-                model_output, sigma_min=self.config.sigma_min, sigma_max=self.config.sigma_max, seed=seed
-            )
-        noise = self.noise_sampler(self.sigmas[self.step_index], self.sigmas[self.step_index + 1]).to(
-            model_output.device
-        )
-
-        if self.config.solver_order == 1 or self.lower_order_nums < 1 or lower_order_final:
-            prev_sample = self.dpm_solver_first_order_update(model_output, sample=sample, noise=noise)
-        elif self.config.solver_order == 2 or self.lower_order_nums < 2 or lower_order_second:
-            prev_sample = self.multistep_dpm_solver_second_order_update(self.model_outputs, sample=sample, noise=noise)
-
-        if self.lower_order_nums < self.config.solver_order:
-            self.lower_order_nums += 1
-
-        # upon completion increase step index by one
-        self._step_index += 1
-
-        if not return_dict:
-            return (prev_sample,)
-
-        return SchedulerOutput(prev_sample=prev_sample)
-
-    # Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.add_noise
-    def add_noise(
-        self,
-        original_samples: torch.Tensor,
-        noise: torch.Tensor,
-        timesteps: torch.Tensor,
-    ) -> torch.Tensor:
-        # Make sure sigmas and timesteps have the same device and dtype as original_samples
-        sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
-        if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
-            # mps does not support float64
-            schedule_timesteps = self.timesteps.to(original_samples.device, dtype=torch.float32)
-            timesteps = timesteps.to(original_samples.device, dtype=torch.float32)
-        else:
-            schedule_timesteps = self.timesteps.to(original_samples.device)
-            timesteps = timesteps.to(original_samples.device)
-
-        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
-        if self.begin_index is None:
-            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
-        elif self.step_index is not None:
-            # add_noise is called after first denoising step (for inpainting)
-            step_indices = [self.step_index] * timesteps.shape[0]
-        else:
-            # add noise is called before first denoising step to create initial latent(img2img)
-            step_indices = [self.begin_index] * timesteps.shape[0]
-
-        sigma = sigmas[step_indices].flatten()
-        while len(sigma.shape) < len(original_samples.shape):
-            sigma = sigma.unsqueeze(-1)
-
-        noisy_samples = original_samples + noise * sigma
-        return noisy_samples
-
-    def __len__(self):
-        return self.config.num_train_timesteps
@@ -134,7 +134,7 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):

        self.timesteps = self.precondition_noise(sigmas)

-        self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])
+        self.sigmas = self.sigmas = torch.cat([sigmas, torch.zeros(1, device=sigmas.device)])

        # setable values
        self.num_inference_steps = None
@@ -93,7 +93,7 @@ from .import_utils import (
    is_xformers_available,
    requires_backends,
 )
-from .loading_utils import load_image, load_video
+from .loading_utils import load_image
 from .logging import get_logger
 from .outputs import BaseOutput
 from .peft_utils import (
@@ -62,21 +62,6 @@ class AutoencoderKLTemporalDecoder(metaclass=DummyObject):
        requires_backends(cls, ["torch"])


-class AutoencoderOobleck(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-
 class AutoencoderTiny(metaclass=DummyObject):
    _backends = ["torch"]

@@ -377,36 +362,6 @@ class SD3Transformer2DModel(metaclass=DummyObject):
        requires_backends(cls, ["torch"])


-class SparseControlNetModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-
-class StableAudioDiTModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-
 class T2IAdapter(metaclass=DummyObject):
    _backends = ["torch"]

@@ -2,21 +2,6 @@
 from ..utils import DummyObject, requires_backends


-class CosineDPMSolverMultistepScheduler(metaclass=DummyObject):
-    _backends = ["torch", "torchsde"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "torchsde"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "torchsde"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "torchsde"])
-
-
 class DPMSolverSDEScheduler(metaclass=DummyObject):
    _backends = ["torch", "torchsde"]

@@ -77,21 +77,6 @@ class AmusedPipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


-class AnimateDiffControlNetPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
 class AnimateDiffPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

@@ -122,21 +107,6 @@ class AnimateDiffSDXLPipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


-class AnimateDiffSparseControlNetPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
 class AnimateDiffVideoToVideoPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

@@ -1007,36 +977,6 @@ class ShapEPipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


-class StableAudioPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
-class StableAudioProjectionModel(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
 class StableCascadeCombinedPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

@@ -9,7 +9,10 @@ import numpy as np
 import PIL.Image
 import PIL.ImageOps

-from .import_utils import BACKENDS_MAPPING, is_opencv_available
+from .import_utils import (
+    BACKENDS_MAPPING,
+    is_opencv_available,
+)
 from .logging import get_logger


@@ -1,16 +1,13 @@
 import os
-import tempfile
-from typing import Callable, List, Optional, Union
+from typing import Callable, Union

 import PIL.Image
 import PIL.ImageOps
 import requests

-from .import_utils import BACKENDS_MAPPING, is_opencv_available
-

 def load_image(
-    image: Union[str, PIL.Image.Image], convert_method: Optional[Callable[[PIL.Image.Image], PIL.Image.Image]] = None
+    image: Union[str, PIL.Image.Image], convert_method: Callable[[PIL.Image.Image], PIL.Image.Image] = None
 ) -> PIL.Image.Image:
    """
    Loads `image` to a PIL Image.
@@ -18,7 +15,7 @@ def load_image(
    Args:
        image (`str` or `PIL.Image.Image`):
            The image to convert to the PIL Image format.
-        convert_method (Callable[[PIL.Image.Image], PIL.Image.Image], *optional*):
+        convert_method (Callable[[PIL.Image.Image], PIL.Image.Image], optional):
            A conversion method to apply to the image after loading it. When set to `None` the image will be converted
            "RGB".

@@ -50,73 +47,3 @@ def load_image(
        image = image.convert("RGB")

    return image
-
-
-def load_video(
-    video: str,
-    convert_method: Optional[Callable[[List[PIL.Image.Image]], List[PIL.Image.Image]]] = None,
-) -> List[PIL.Image.Image]:
-    """
-    Loads `video` to a list of PIL Image.
-
-    Args:
-        video (`str`):
-            A URL or Path to a video to convert to a list of PIL Image format.
-        convert_method (Callable[[List[PIL.Image.Image]], List[PIL.Image.Image]], *optional*):
-            A conversion method to apply to the video after loading it. When set to `None` the images will be converted
-            to "RGB".
-
-    Returns:
-        `List[PIL.Image.Image]`:
-            The video as a list of PIL images.
-    """
-    is_url = video.startswith("http://") or video.startswith("https://")
-    is_file = os.path.isfile(video)
-    was_tempfile_created = False
-
-    if not (is_url or is_file):
-        raise ValueError(
-            f"Incorrect path or URL. URLs must start with `http://` or `https://`, and {video} is not a valid path."
-        )
-
-    if is_url:
-        video_data = requests.get(video, stream=True).raw
-        video_path = tempfile.NamedTemporaryFile(suffix=os.path.splitext(video)[1], delete=False).name
-        was_tempfile_created = True
-        with open(video_path, "wb") as f:
-            f.write(video_data.read())
-
-        video = video_path
-
-    pil_images = []
-    if video.endswith(".gif"):
-        gif = PIL.Image.open(video)
-        try:
-            while True:
-                pil_images.append(gif.copy())
-                gif.seek(gif.tell() + 1)
-        except EOFError:
-            pass
-
-    else:
-        if is_opencv_available():
-            import cv2
-        else:
-            raise ImportError(BACKENDS_MAPPING["opencv"][1].format("load_video"))
-
-        video_capture = cv2.VideoCapture(video)
-        success, frame = video_capture.read()
-        while success:
-            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-            pil_images.append(PIL.Image.fromarray(frame))
-            success, frame = video_capture.read()
-
-        video_capture.release()
-
-    if was_tempfile_created:
-        os.remove(video_path)
-
-    if convert_method is not None:
-        pil_images = convert_method(pil_images)
-
-    return pil_images
@@ -18,14 +18,12 @@ import unittest

 import numpy as np
 import torch
-from datasets import load_dataset
 from parameterized import parameterized

 from diffusers import (
    AsymmetricAutoencoderKL,
    AutoencoderKL,
    AutoencoderKLTemporalDecoder,
-    AutoencoderOobleck,
    AutoencoderTiny,
    ConsistencyDecoderVAE,
    StableDiffusionPipeline,
@@ -130,18 +128,6 @@ def get_consistency_vae_config(block_out_channels=None, norm_num_groups=None):
    }


-def get_autoencoder_oobleck_config(block_out_channels=None):
-    init_dict = {
-        "encoder_hidden_size": 12,
-        "decoder_channels": 12,
-        "decoder_input_channels": 6,
-        "audio_channels": 2,
-        "downsampling_ratios": [2, 4],
-        "channel_multiples": [1, 2],
-    }
-    return init_dict
-
-
 class AutoencoderKLTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
    model_class = AutoencoderKL
    main_input_name = "sample"
@@ -494,41 +480,6 @@ class AutoencoderKLTemporalDecoderFastTests(ModelTesterMixin, unittest.TestCase)
            self.assertTrue(torch_all_close(param.grad.data, named_params_2[name].grad.data, atol=5e-5))


-class AutoencoderOobleckTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
-    model_class = AutoencoderOobleck
-    main_input_name = "sample"
-    base_precision = 1e-2
-
-    @property
-    def dummy_input(self):
-        batch_size = 4
-        num_channels = 2
-        seq_len = 24
-
-        waveform = floats_tensor((batch_size, num_channels, seq_len)).to(torch_device)
-
-        return {"sample": waveform, "sample_posterior": False}
-
-    @property
-    def input_shape(self):
-        return (2, 24)
-
-    @property
-    def output_shape(self):
-        return (2, 24)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = get_autoencoder_oobleck_config()
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_forward_signature(self):
-        pass
-
-    def test_forward_with_norm_groups(self):
-        pass
-
-
@slow
 class AutoencoderTinyIntegrationTests(unittest.TestCase):
    def tearDown(self):
@@ -1149,116 +1100,3 @@ class ConsistencyDecoderVAEIntegrationTests(unittest.TestCase):
            for shape in shapes:
                image = torch.zeros(shape, device=torch_device, dtype=pipe.vae.dtype)
                pipe.vae.decode(image)
-
-
-@slow
-class AutoencoderOobleckIntegrationTests(unittest.TestCase):
-    def tearDown(self):
-        # clean up the VRAM after each test
-        super().tearDown()
-        gc.collect()
-        backend_empty_cache(torch_device)
-
-    def _load_datasamples(self, num_samples):
-        ds = load_dataset(
-            "hf-internal-testing/librispeech_asr_dummy", "clean", split="validation", trust_remote_code=True
-        )
-        # automatic decoding with librispeech
-        speech_samples = ds.sort("id").select(range(num_samples))[:num_samples]["audio"]
-
-        return torch.nn.utils.rnn.pad_sequence(
-            [torch.from_numpy(x["array"]) for x in speech_samples], batch_first=True
-        )
-
-    def get_audio(self, audio_sample_size=2097152, fp16=False):
-        dtype = torch.float16 if fp16 else torch.float32
-        audio = self._load_datasamples(2).to(torch_device).to(dtype)
-
-        # pad / crop to audio_sample_size
-        audio = torch.nn.functional.pad(audio[:, :audio_sample_size], pad=(0, audio_sample_size - audio.shape[-1]))
-
-        # todo channel
-        audio = audio.unsqueeze(1).repeat(1, 2, 1).to(torch_device)
-
-        return audio
-
-    def get_oobleck_vae_model(self, model_id="stabilityai/stable-audio-open-1.0", fp16=False):
-        torch_dtype = torch.float16 if fp16 else torch.float32
-
-        model = AutoencoderOobleck.from_pretrained(
-            model_id,
-            subfolder="vae",
-            torch_dtype=torch_dtype,
-        )
-        model.to(torch_device)
-
-        return model
-
-    def get_generator(self, seed=0):
-        generator_device = "cpu" if not torch_device.startswith("cuda") else "cuda"
-        if torch_device != "mps":
-            return torch.Generator(device=generator_device).manual_seed(seed)
-        return torch.manual_seed(seed)
-
-    @parameterized.expand(
-        [
-            # fmt: off
-            [33, [1.193e-4, 6.56e-05, 1.314e-4, 3.80e-05, -4.01e-06], 0.001192],
-            [44, [2.77e-05, -2.65e-05, 1.18e-05, -6.94e-05, -9.57e-05], 0.001196],
-            # fmt: on
-        ]
-    )
-    def test_stable_diffusion(self, seed, expected_slice, expected_mean_absolute_diff):
-        model = self.get_oobleck_vae_model()
-        audio = self.get_audio()
-        generator = self.get_generator(seed)
-
-        with torch.no_grad():
-            sample = model(audio, generator=generator, sample_posterior=True).sample
-
-        assert sample.shape == audio.shape
-        assert ((sample - audio).abs().mean() - expected_mean_absolute_diff).abs() <= 1e-6
-
-        output_slice = sample[-1, 1, 5:10].cpu()
-        expected_output_slice = torch.tensor(expected_slice)
-
-        assert torch_all_close(output_slice, expected_output_slice, atol=1e-5)
-
-    def test_stable_diffusion_mode(self):
-        model = self.get_oobleck_vae_model()
-        audio = self.get_audio()
-
-        with torch.no_grad():
-            sample = model(audio, sample_posterior=False).sample
-
-        assert sample.shape == audio.shape
-
-    @parameterized.expand(
-        [
-            # fmt: off
-            [33, [1.193e-4, 6.56e-05, 1.314e-4, 3.80e-05, -4.01e-06], 0.001192],
-            [44, [2.77e-05, -2.65e-05, 1.18e-05, -6.94e-05, -9.57e-05], 0.001196],
-            # fmt: on
-        ]
-    )
-    def test_stable_diffusion_encode_decode(self, seed, expected_slice, expected_mean_absolute_diff):
-        model = self.get_oobleck_vae_model()
-        audio = self.get_audio()
-        generator = self.get_generator(seed)
-
-        with torch.no_grad():
-            x = audio
-            posterior = model.encode(x).latent_dist
-            z = posterior.sample(generator=generator)
-            sample = model.decode(z).sample
-
-        # (batch_size, latent_dim, sequence_length)
-        assert posterior.mean.shape == (audio.shape[0], model.config.decoder_input_channels, 1024)
-
-        assert sample.shape == audio.shape
-        assert ((sample - audio).abs().mean() - expected_mean_absolute_diff).abs() <= 1e-6
-
-        output_slice = sample[-1, 1, 5:10].cpu()
-        expected_output_slice = torch.tensor(expected_slice)
-
-        assert torch_all_close(output_slice, expected_output_slice, atol=1e-5)
@@ -10,8 +10,6 @@ from diffusers import (
    AnimateDiffPipeline,
    AutoencoderKL,
    DDIMScheduler,
-    DPMSolverMultistepScheduler,
-    LCMScheduler,
    MotionAdapter,
    StableDiffusionPipeline,
    UNet2DConditionModel,
@@ -355,52 +353,6 @@ class AnimateDiffPipelineFastTests(
            "Disabling of FreeInit should lead to results similar to the default pipeline results",
        )

-    def test_free_init_with_schedulers(self):
-        components = self.get_dummy_components()
-        pipe: AnimateDiffPipeline = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.to(torch_device)
-
-        inputs_normal = self.get_dummy_inputs(torch_device)
-        frames_normal = pipe(**inputs_normal).frames[0]
-
-        schedulers_to_test = [
-            DPMSolverMultistepScheduler.from_config(
-                components["scheduler"].config,
-                timestep_spacing="linspace",
-                beta_schedule="linear",
-                algorithm_type="dpmsolver++",
-                steps_offset=1,
-                clip_sample=False,
-            ),
-            LCMScheduler.from_config(
-                components["scheduler"].config,
-                timestep_spacing="linspace",
-                beta_schedule="linear",
-                steps_offset=1,
-                clip_sample=False,
-            ),
-        ]
-        components.pop("scheduler")
-
-        for scheduler in schedulers_to_test:
-            components["scheduler"] = scheduler
-            pipe: AnimateDiffPipeline = self.pipeline_class(**components)
-            pipe.set_progress_bar_config(disable=None)
-            pipe.to(torch_device)
-
-            pipe.enable_free_init(num_iters=2, use_fast_sampling=False)
-
-            inputs = self.get_dummy_inputs(torch_device)
-            frames_enable_free_init = pipe(**inputs).frames[0]
-            sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
-
-            self.assertGreater(
-                sum_enabled,
-                1e1,
-                "Enabling of FreeInit should lead to results different from the default pipeline results",
-            )
-
    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
@@ -1,431 +0,0 @@
-import unittest
-
-import numpy as np
-import torch
-from PIL import Image
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-
-import diffusers
-from diffusers import (
-    AnimateDiffControlNetPipeline,
-    AutoencoderKL,
-    ControlNetModel,
-    DDIMScheduler,
-    DPMSolverMultistepScheduler,
-    LCMScheduler,
-    MotionAdapter,
-    StableDiffusionPipeline,
-    UNet2DConditionModel,
-    UNetMotionModel,
-)
-from diffusers.utils import logging
-from diffusers.utils.testing_utils import torch_device
-
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import (
-    IPAdapterTesterMixin,
-    PipelineFromPipeTesterMixin,
-    PipelineTesterMixin,
-    SDFunctionTesterMixin,
-)
-
-
-def to_np(tensor):
-    if isinstance(tensor, torch.Tensor):
-        tensor = tensor.detach().cpu().numpy()
-
-    return tensor
-
-
-class AnimateDiffControlNetPipelineFastTests(
-    IPAdapterTesterMixin, SDFunctionTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase
-):
-    pipeline_class = AnimateDiffControlNetPipeline
-    params = TEXT_TO_IMAGE_PARAMS
-    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"conditioning_frames"})
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "generator",
-            "latents",
-            "return_dict",
-            "callback_on_step_end",
-            "callback_on_step_end_tensor_inputs",
-        ]
-    )
-
-    def get_dummy_components(self):
-        cross_attention_dim = 8
-        block_out_channels = (8, 8)
-
-        torch.manual_seed(0)
-        unet = UNet2DConditionModel(
-            block_out_channels=block_out_channels,
-            layers_per_block=2,
-            sample_size=8,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=cross_attention_dim,
-            norm_num_groups=2,
-        )
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="linear",
-            clip_sample=False,
-        )
-        torch.manual_seed(0)
-        controlnet = ControlNetModel(
-            block_out_channels=block_out_channels,
-            layers_per_block=2,
-            in_channels=4,
-            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
-            cross_attention_dim=cross_attention_dim,
-            conditioning_embedding_out_channels=(8, 8),
-            norm_num_groups=1,
-        )
-        torch.manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=block_out_channels,
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-            norm_num_groups=2,
-        )
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=cross_attention_dim,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-        )
-        text_encoder = CLIPTextModel(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        motion_adapter = MotionAdapter(
-            block_out_channels=block_out_channels,
-            motion_layers_per_block=2,
-            motion_norm_num_groups=2,
-            motion_num_attention_heads=4,
-        )
-
-        components = {
-            "unet": unet,
-            "controlnet": controlnet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "motion_adapter": motion_adapter,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "feature_extractor": None,
-            "image_encoder": None,
-        }
-        return components
-
-    def get_dummy_inputs(self, device, seed: int = 0, num_frames: int = 2):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-
-        video_height = 32
-        video_width = 32
-        conditioning_frames = [Image.new("RGB", (video_width, video_height))] * num_frames
-
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "conditioning_frames": conditioning_frames,
-            "generator": generator,
-            "num_inference_steps": 2,
-            "num_frames": num_frames,
-            "guidance_scale": 7.5,
-            "output_type": "pt",
-        }
-        return inputs
-
-    def test_from_pipe_consistent_config(self):
-        assert self.original_pipeline_class == StableDiffusionPipeline
-        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
-        original_kwargs = {"requires_safety_checker": False}
-
-        # create original_pipeline_class(sd)
-        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
-
-        # original_pipeline_class(sd) -> pipeline_class
-        pipe_components = self.get_dummy_components()
-        pipe_additional_components = {}
-        for name, component in pipe_components.items():
-            if name not in pipe_original.components:
-                pipe_additional_components[name] = component
-
-        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
-
-        # pipeline_class -> original_pipeline_class(sd)
-        original_pipe_additional_components = {}
-        for name, component in pipe_original.components.items():
-            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
-                original_pipe_additional_components[name] = component
-
-        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
-
-        # compare the config
-        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
-        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
-        assert original_config_2 == original_config
-
-    def test_motion_unet_loading(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-
-        assert isinstance(pipe.unet, UNetMotionModel)
-
-    @unittest.skip("Attention slicing is not enabled in this pipeline")
-    def test_attention_slicing_forward_pass(self):
-        pass
-
-    def test_ip_adapter_single(self):
-        expected_pipe_slice = None
-        if torch_device == "cpu":
-            expected_pipe_slice = np.array(
-                [
-                    0.6604,
-                    0.4099,
-                    0.4928,
-                    0.5706,
-                    0.5096,
-                    0.5012,
-                    0.6051,
-                    0.5169,
-                    0.5021,
-                    0.4864,
-                    0.4261,
-                    0.5779,
-                    0.5822,
-                    0.4049,
-                    0.5253,
-                    0.6160,
-                    0.4150,
-                    0.5155,
-                ]
-            )
-        return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
-
-    def test_dict_tuple_outputs_equivalent(self):
-        expected_slice = None
-        if torch_device == "cpu":
-            expected_slice = np.array([0.6051, 0.5169, 0.5021, 0.6160, 0.4150, 0.5155])
-        return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
-
-    def test_inference_batch_single_identical(
-        self,
-        batch_size=2,
-        expected_max_diff=1e-4,
-        additional_params_copy_to_batched_inputs=["num_inference_steps"],
-    ):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        for components in pipe.components.values():
-            if hasattr(components, "set_default_attn_processor"):
-                components.set_default_attn_processor()
-
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(torch_device)
-        # Reset generator in case it is has been used in self.get_dummy_inputs
-        inputs["generator"] = self.get_generator(0)
-
-        logger = logging.get_logger(pipe.__module__)
-        logger.setLevel(level=diffusers.logging.FATAL)
-
-        # batchify inputs
-        batched_inputs = {}
-        batched_inputs.update(inputs)
-
-        for name in self.batch_params:
-            if name not in inputs:
-                continue
-
-            value = inputs[name]
-            if name == "prompt":
-                len_prompt = len(value)
-                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
-                batched_inputs[name][-1] = 100 * "very long"
-
-            else:
-                batched_inputs[name] = batch_size * [value]
-
-        if "generator" in inputs:
-            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
-
-        if "batch_size" in inputs:
-            batched_inputs["batch_size"] = batch_size
-
-        for arg in additional_params_copy_to_batched_inputs:
-            batched_inputs[arg] = inputs[arg]
-
-        output = pipe(**inputs)
-        output_batch = pipe(**batched_inputs)
-
-        assert output_batch[0].shape[0] == batch_size
-
-        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
-        assert max_diff < expected_max_diff
-
-    @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
-    def test_to_device(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-
-        pipe.to("cpu")
-        # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
-        model_devices = [
-            component.device.type for component in pipe.components.values() if hasattr(component, "device")
-        ]
-        self.assertTrue(all(device == "cpu" for device in model_devices))
-
-        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
-        self.assertTrue(np.isnan(output_cpu).sum() == 0)
-
-        pipe.to("cuda")
-        model_devices = [
-            component.device.type for component in pipe.components.values() if hasattr(component, "device")
-        ]
-        self.assertTrue(all(device == "cuda" for device in model_devices))
-
-        output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
-        self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
-
-    def test_to_dtype(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-
-        # pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
-        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
-        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
-
-        pipe.to(dtype=torch.float16)
-        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
-        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
-
-    def test_prompt_embeds(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.to(torch_device)
-
-        inputs = self.get_dummy_inputs(torch_device)
-        inputs.pop("prompt")
-        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
-        pipe(**inputs)
-
-    def test_free_init(self):
-        components = self.get_dummy_components()
-        pipe: AnimateDiffControlNetPipeline = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.to(torch_device)
-
-        inputs_normal = self.get_dummy_inputs(torch_device)
-        frames_normal = pipe(**inputs_normal).frames[0]
-
-        pipe.enable_free_init(
-            num_iters=2,
-            use_fast_sampling=True,
-            method="butterworth",
-            order=4,
-            spatial_stop_frequency=0.25,
-            temporal_stop_frequency=0.25,
-        )
-        inputs_enable_free_init = self.get_dummy_inputs(torch_device)
-        frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]
-
-        pipe.disable_free_init()
-        inputs_disable_free_init = self.get_dummy_inputs(torch_device)
-        frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0]
-
-        sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
-        max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max()
-        self.assertGreater(
-            sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results"
-        )
-        self.assertLess(
-            max_diff_disabled,
-            1e-4,
-            "Disabling of FreeInit should lead to results similar to the default pipeline results",
-        )
-
-    def test_free_init_with_schedulers(self):
-        components = self.get_dummy_components()
-        pipe: AnimateDiffControlNetPipeline = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.to(torch_device)
-
-        inputs_normal = self.get_dummy_inputs(torch_device)
-        frames_normal = pipe(**inputs_normal).frames[0]
-
-        schedulers_to_test = [
-            DPMSolverMultistepScheduler.from_config(
-                components["scheduler"].config,
-                timestep_spacing="linspace",
-                beta_schedule="linear",
-                algorithm_type="dpmsolver++",
-                steps_offset=1,
-                clip_sample=False,
-            ),
-            LCMScheduler.from_config(
-                components["scheduler"].config,
-                timestep_spacing="linspace",
-                beta_schedule="linear",
-                steps_offset=1,
-                clip_sample=False,
-            ),
-        ]
-        components.pop("scheduler")
-
-        for scheduler in schedulers_to_test:
-            components["scheduler"] = scheduler
-            pipe: AnimateDiffControlNetPipeline = self.pipeline_class(**components)
-            pipe.set_progress_bar_config(disable=None)
-            pipe.to(torch_device)
-
-            pipe.enable_free_init(num_iters=2, use_fast_sampling=False)
-
-            inputs = self.get_dummy_inputs(torch_device)
-            frames_enable_free_init = pipe(**inputs).frames[0]
-            sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
-
-            self.assertGreater(
-                sum_enabled,
-                1e1,
-                "Enabling of FreeInit should lead to results different from the default pipeline results",
-            )
-
-    def test_vae_slicing(self, video_count=2):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        inputs["prompt"] = [inputs["prompt"]] * video_count
-        inputs["conditioning_frames"] = [inputs["conditioning_frames"]] * video_count
-        output_1 = pipe(**inputs)
-
-        # make sure sliced vae decode yields the same result
-        pipe.enable_vae_slicing()
-        inputs = self.get_dummy_inputs(device)
-        inputs["prompt"] = [inputs["prompt"]] * video_count
-        inputs["conditioning_frames"] = [inputs["conditioning_frames"]] * video_count
-        output_2 = pipe(**inputs)
-
-        assert np.abs(output_2[0].flatten() - output_1[0].flatten()).max() < 1e-2
@@ -1,478 +0,0 @@
-import unittest
-
-import numpy as np
-import torch
-from PIL import Image
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer
-
-import diffusers
-from diffusers import (
-    AnimateDiffSparseControlNetPipeline,
-    AutoencoderKL,
-    DDIMScheduler,
-    DPMSolverMultistepScheduler,
-    LCMScheduler,
-    MotionAdapter,
-    SparseControlNetModel,
-    StableDiffusionPipeline,
-    UNet2DConditionModel,
-    UNetMotionModel,
-)
-from diffusers.utils import logging
-from diffusers.utils.testing_utils import torch_device
-
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import (
-    IPAdapterTesterMixin,
-    PipelineFromPipeTesterMixin,
-    PipelineTesterMixin,
-    SDFunctionTesterMixin,
-)
-
-
-def to_np(tensor):
-    if isinstance(tensor, torch.Tensor):
-        tensor = tensor.detach().cpu().numpy()
-
-    return tensor
-
-
-class AnimateDiffSparseControlNetPipelineFastTests(
-    IPAdapterTesterMixin, SDFunctionTesterMixin, PipelineTesterMixin, PipelineFromPipeTesterMixin, unittest.TestCase
-):
-    pipeline_class = AnimateDiffSparseControlNetPipeline
-    params = TEXT_TO_IMAGE_PARAMS
-    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "generator",
-            "latents",
-            "return_dict",
-            "callback_on_step_end",
-            "callback_on_step_end_tensor_inputs",
-        ]
-    )
-
-    def get_dummy_components(self):
-        cross_attention_dim = 8
-        block_out_channels = (8, 8)
-
-        torch.manual_seed(0)
-        unet = UNet2DConditionModel(
-            block_out_channels=block_out_channels,
-            layers_per_block=2,
-            sample_size=8,
-            in_channels=4,
-            out_channels=4,
-            down_block_types=("CrossAttnDownBlock2D", "DownBlock2D"),
-            up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"),
-            cross_attention_dim=cross_attention_dim,
-            norm_num_groups=2,
-        )
-        scheduler = DDIMScheduler(
-            beta_start=0.00085,
-            beta_end=0.012,
-            beta_schedule="linear",
-            clip_sample=False,
-        )
-        torch.manual_seed(0)
-        controlnet = SparseControlNetModel(
-            block_out_channels=block_out_channels,
-            layers_per_block=2,
-            in_channels=4,
-            conditioning_channels=3,
-            down_block_types=("CrossAttnDownBlockMotion", "DownBlockMotion"),
-            cross_attention_dim=cross_attention_dim,
-            conditioning_embedding_out_channels=(8, 8),
-            norm_num_groups=1,
-            use_simplified_condition_embedding=False,
-        )
-        torch.manual_seed(0)
-        vae = AutoencoderKL(
-            block_out_channels=block_out_channels,
-            in_channels=3,
-            out_channels=3,
-            down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"],
-            up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"],
-            latent_channels=4,
-            norm_num_groups=2,
-        )
-        torch.manual_seed(0)
-        text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=cross_attention_dim,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=5,
-            pad_token_id=1,
-            vocab_size=1000,
-        )
-        text_encoder = CLIPTextModel(text_encoder_config)
-        tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-        motion_adapter = MotionAdapter(
-            block_out_channels=block_out_channels,
-            motion_layers_per_block=2,
-            motion_norm_num_groups=2,
-            motion_num_attention_heads=4,
-        )
-
-        components = {
-            "unet": unet,
-            "controlnet": controlnet,
-            "scheduler": scheduler,
-            "vae": vae,
-            "motion_adapter": motion_adapter,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "feature_extractor": None,
-            "image_encoder": None,
-        }
-        return components
-
-    def get_dummy_inputs(self, device, seed: int = 0, num_frames: int = 2):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-
-        video_height = 32
-        video_width = 32
-        conditioning_frames = [Image.new("RGB", (video_width, video_height))] * num_frames
-
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "conditioning_frames": conditioning_frames,
-            "controlnet_frame_indices": list(range(num_frames)),
-            "generator": generator,
-            "num_inference_steps": 2,
-            "num_frames": num_frames,
-            "guidance_scale": 7.5,
-            "output_type": "pt",
-        }
-        return inputs
-
-    def test_from_pipe_consistent_config(self):
-        assert self.original_pipeline_class == StableDiffusionPipeline
-        original_repo = "hf-internal-testing/tinier-stable-diffusion-pipe"
-        original_kwargs = {"requires_safety_checker": False}
-
-        # create original_pipeline_class(sd)
-        pipe_original = self.original_pipeline_class.from_pretrained(original_repo, **original_kwargs)
-
-        # original_pipeline_class(sd) -> pipeline_class
-        pipe_components = self.get_dummy_components()
-        pipe_additional_components = {}
-        for name, component in pipe_components.items():
-            if name not in pipe_original.components:
-                pipe_additional_components[name] = component
-
-        pipe = self.pipeline_class.from_pipe(pipe_original, **pipe_additional_components)
-
-        # pipeline_class -> original_pipeline_class(sd)
-        original_pipe_additional_components = {}
-        for name, component in pipe_original.components.items():
-            if name not in pipe.components or not isinstance(component, pipe.components[name].__class__):
-                original_pipe_additional_components[name] = component
-
-        pipe_original_2 = self.original_pipeline_class.from_pipe(pipe, **original_pipe_additional_components)
-
-        # compare the config
-        original_config = {k: v for k, v in pipe_original.config.items() if not k.startswith("_")}
-        original_config_2 = {k: v for k, v in pipe_original_2.config.items() if not k.startswith("_")}
-        assert original_config_2 == original_config
-
-    def test_motion_unet_loading(self):
-        components = self.get_dummy_components()
-        pipe = AnimateDiffSparseControlNetPipeline(**components)
-
-        assert isinstance(pipe.unet, UNetMotionModel)
-
-    @unittest.skip("Attention slicing is not enabled in this pipeline")
-    def test_attention_slicing_forward_pass(self):
-        pass
-
-    def test_ip_adapter_single(self):
-        expected_pipe_slice = None
-        if torch_device == "cpu":
-            expected_pipe_slice = np.array(
-                [
-                    0.6604,
-                    0.4099,
-                    0.4928,
-                    0.5706,
-                    0.5096,
-                    0.5012,
-                    0.6051,
-                    0.5169,
-                    0.5021,
-                    0.4864,
-                    0.4261,
-                    0.5779,
-                    0.5822,
-                    0.4049,
-                    0.5253,
-                    0.6160,
-                    0.4150,
-                    0.5155,
-                ]
-            )
-        return super().test_ip_adapter_single(expected_pipe_slice=expected_pipe_slice)
-
-    def test_dict_tuple_outputs_equivalent(self):
-        expected_slice = None
-        if torch_device == "cpu":
-            expected_slice = np.array([0.6051, 0.5169, 0.5021, 0.6160, 0.4150, 0.5155])
-        return super().test_dict_tuple_outputs_equivalent(expected_slice=expected_slice)
-
-    def test_inference_batch_single_identical(
-        self,
-        batch_size=2,
-        expected_max_diff=1e-4,
-        additional_params_copy_to_batched_inputs=["num_inference_steps"],
-    ):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        for components in pipe.components.values():
-            if hasattr(components, "set_default_attn_processor"):
-                components.set_default_attn_processor()
-
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(torch_device)
-        # Reset generator in case it is has been used in self.get_dummy_inputs
-        inputs["generator"] = self.get_generator(0)
-
-        logger = logging.get_logger(pipe.__module__)
-        logger.setLevel(level=diffusers.logging.FATAL)
-
-        # batchify inputs
-        batched_inputs = {}
-        batched_inputs.update(inputs)
-
-        for name in self.batch_params:
-            if name not in inputs:
-                continue
-
-            value = inputs[name]
-            if name == "prompt":
-                len_prompt = len(value)
-                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
-                batched_inputs[name][-1] = 100 * "very long"
-
-            else:
-                batched_inputs[name] = batch_size * [value]
-
-        if "generator" in inputs:
-            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
-
-        if "batch_size" in inputs:
-            batched_inputs["batch_size"] = batch_size
-
-        for arg in additional_params_copy_to_batched_inputs:
-            batched_inputs[arg] = inputs[arg]
-
-        output = pipe(**inputs)
-        output_batch = pipe(**batched_inputs)
-
-        assert output_batch[0].shape[0] == batch_size
-
-        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
-        assert max_diff < expected_max_diff
-
-    def test_inference_batch_single_identical_use_simplified_condition_embedding_true(
-        self,
-        batch_size=2,
-        expected_max_diff=1e-4,
-        additional_params_copy_to_batched_inputs=["num_inference_steps"],
-    ):
-        components = self.get_dummy_components()
-
-        torch.manual_seed(0)
-        old_controlnet = components.pop("controlnet")
-        components["controlnet"] = SparseControlNetModel.from_config(
-            old_controlnet.config, conditioning_channels=4, use_simplified_condition_embedding=True
-        )
-
-        pipe = self.pipeline_class(**components)
-        for components in pipe.components.values():
-            if hasattr(components, "set_default_attn_processor"):
-                components.set_default_attn_processor()
-
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        inputs = self.get_dummy_inputs(torch_device)
-        # Reset generator in case it is has been used in self.get_dummy_inputs
-        inputs["generator"] = self.get_generator(0)
-
-        logger = logging.get_logger(pipe.__module__)
-        logger.setLevel(level=diffusers.logging.FATAL)
-
-        # batchify inputs
-        batched_inputs = {}
-        batched_inputs.update(inputs)
-
-        for name in self.batch_params:
-            if name not in inputs:
-                continue
-
-            value = inputs[name]
-            if name == "prompt":
-                len_prompt = len(value)
-                batched_inputs[name] = [value[: len_prompt // i] for i in range(1, batch_size + 1)]
-                batched_inputs[name][-1] = 100 * "very long"
-
-            else:
-                batched_inputs[name] = batch_size * [value]
-
-        if "generator" in inputs:
-            batched_inputs["generator"] = [self.get_generator(i) for i in range(batch_size)]
-
-        if "batch_size" in inputs:
-            batched_inputs["batch_size"] = batch_size
-
-        for arg in additional_params_copy_to_batched_inputs:
-            batched_inputs[arg] = inputs[arg]
-
-        output = pipe(**inputs)
-        output_batch = pipe(**batched_inputs)
-
-        assert output_batch[0].shape[0] == batch_size
-
-        max_diff = np.abs(to_np(output_batch[0][0]) - to_np(output[0][0])).max()
-        assert max_diff < expected_max_diff
-
-    @unittest.skipIf(torch_device != "cuda", reason="CUDA and CPU are required to switch devices")
-    def test_to_device(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-
-        pipe.to("cpu")
-        # pipeline creates a new motion UNet under the hood. So we need to check the device from pipe.components
-        model_devices = [
-            component.device.type for component in pipe.components.values() if hasattr(component, "device")
-        ]
-        self.assertTrue(all(device == "cpu" for device in model_devices))
-
-        output_cpu = pipe(**self.get_dummy_inputs("cpu"))[0]
-        self.assertTrue(np.isnan(output_cpu).sum() == 0)
-
-        pipe.to("cuda")
-        model_devices = [
-            component.device.type for component in pipe.components.values() if hasattr(component, "device")
-        ]
-        self.assertTrue(all(device == "cuda" for device in model_devices))
-
-        output_cuda = pipe(**self.get_dummy_inputs("cuda"))[0]
-        self.assertTrue(np.isnan(to_np(output_cuda)).sum() == 0)
-
-    def test_to_dtype(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-
-        # pipeline creates a new motion UNet under the hood. So we need to check the dtype from pipe.components
-        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
-        self.assertTrue(all(dtype == torch.float32 for dtype in model_dtypes))
-
-        pipe.to(dtype=torch.float16)
-        model_dtypes = [component.dtype for component in pipe.components.values() if hasattr(component, "dtype")]
-        self.assertTrue(all(dtype == torch.float16 for dtype in model_dtypes))
-
-    def test_prompt_embeds(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.to(torch_device)
-
-        inputs = self.get_dummy_inputs(torch_device)
-        inputs.pop("prompt")
-        inputs["prompt_embeds"] = torch.randn((1, 4, pipe.text_encoder.config.hidden_size), device=torch_device)
-        pipe(**inputs)
-
-    def test_free_init(self):
-        components = self.get_dummy_components()
-        pipe: AnimateDiffSparseControlNetPipeline = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.to(torch_device)
-
-        inputs_normal = self.get_dummy_inputs(torch_device)
-        frames_normal = pipe(**inputs_normal).frames[0]
-
-        pipe.enable_free_init(
-            num_iters=2,
-            use_fast_sampling=True,
-            method="butterworth",
-            order=4,
-            spatial_stop_frequency=0.25,
-            temporal_stop_frequency=0.25,
-        )
-        inputs_enable_free_init = self.get_dummy_inputs(torch_device)
-        frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]
-
-        pipe.disable_free_init()
-        inputs_disable_free_init = self.get_dummy_inputs(torch_device)
-        frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0]
-
-        sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
-        max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max()
-        self.assertGreater(
-            sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results"
-        )
-        self.assertLess(
-            max_diff_disabled,
-            1e-4,
-            "Disabling of FreeInit should lead to results similar to the default pipeline results",
-        )
-
-    def test_free_init_with_schedulers(self):
-        components = self.get_dummy_components()
-        pipe: AnimateDiffSparseControlNetPipeline = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.to(torch_device)
-
-        inputs_normal = self.get_dummy_inputs(torch_device)
-        frames_normal = pipe(**inputs_normal).frames[0]
-
-        schedulers_to_test = [
-            DPMSolverMultistepScheduler.from_config(
-                components["scheduler"].config,
-                timestep_spacing="linspace",
-                beta_schedule="linear",
-                algorithm_type="dpmsolver++",
-                steps_offset=1,
-                clip_sample=False,
-            ),
-            LCMScheduler.from_config(
-                components["scheduler"].config,
-                timestep_spacing="linspace",
-                beta_schedule="linear",
-                steps_offset=1,
-                clip_sample=False,
-            ),
-        ]
-        components.pop("scheduler")
-
-        for scheduler in schedulers_to_test:
-            components["scheduler"] = scheduler
-            pipe: AnimateDiffSparseControlNetPipeline = self.pipeline_class(**components)
-            pipe.set_progress_bar_config(disable=None)
-            pipe.to(torch_device)
-
-            pipe.enable_free_init(num_iters=2, use_fast_sampling=False)
-
-            inputs = self.get_dummy_inputs(torch_device)
-            frames_enable_free_init = pipe(**inputs).frames[0]
-            sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
-
-            self.assertGreater(
-                sum_enabled,
-                1e1,
-                "Enabling of FreeInit should lead to results different from the default pipeline results",
-            )
-
-    def test_vae_slicing(self):
-        return super().test_vae_slicing(image_count=2)
@@ -10,8 +10,6 @@ from diffusers import (
    AnimateDiffVideoToVideoPipeline,
    AutoencoderKL,
    DDIMScheduler,
-    DPMSolverMultistepScheduler,
-    LCMScheduler,
    MotionAdapter,
    StableDiffusionPipeline,
    UNet2DConditionModel,
@@ -382,49 +380,3 @@ class AnimateDiffVideoToVideoPipelineFastTests(
            1e-4,
            "Disabling of FreeInit should lead to results similar to the default pipeline results",
        )
-
-    def test_free_init_with_schedulers(self):
-        components = self.get_dummy_components()
-        pipe: AnimateDiffVideoToVideoPipeline = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.to(torch_device)
-
-        inputs_normal = self.get_dummy_inputs(torch_device)
-        frames_normal = pipe(**inputs_normal).frames[0]
-
-        schedulers_to_test = [
-            DPMSolverMultistepScheduler.from_config(
-                components["scheduler"].config,
-                timestep_spacing="linspace",
-                beta_schedule="linear",
-                algorithm_type="dpmsolver++",
-                steps_offset=1,
-                clip_sample=False,
-            ),
-            LCMScheduler.from_config(
-                components["scheduler"].config,
-                timestep_spacing="linspace",
-                beta_schedule="linear",
-                steps_offset=1,
-                clip_sample=False,
-            ),
-        ]
-        components.pop("scheduler")
-
-        for scheduler in schedulers_to_test:
-            components["scheduler"] = scheduler
-            pipe: AnimateDiffVideoToVideoPipeline = self.pipeline_class(**components)
-            pipe.set_progress_bar_config(disable=None)
-            pipe.to(torch_device)
-
-            pipe.enable_free_init(num_iters=2, use_fast_sampling=False)
-
-            inputs = self.get_dummy_inputs(torch_device)
-            frames_enable_free_init = pipe(**inputs).frames[0]
-            sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
-
-            self.assertGreater(
-                sum_enabled,
-                1e1,
-                "Enabling of FreeInit should lead to results different from the default pipeline results",
-            )
@@ -96,8 +96,6 @@ class KolorsPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
            "vae": vae,
            "text_encoder": text_encoder,
            "tokenizer": tokenizer,
-            "image_encoder": None,
-            "feature_extractor": None,
        }
        return components

@@ -134,10 +132,8 @@ class KolorsPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
        self.assertLessEqual(max_diff, 1e-3)

-    # throws AttributeError: property 'eos_token' of 'ChatGLMTokenizer' object has no setter
-    # not sure if it is worth to fix it before integrating it to transformers
+    # should skip it but pipe._optional_components = [] so it doesn't
    def test_save_load_optional_components(self):
-        # TODO (Alvaro) need to fix later
        pass

    # throws AttributeError: property 'eos_token' of 'ChatGLMTokenizer' object has no setter
@@ -9,8 +9,6 @@ import diffusers
 from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
-    DPMSolverMultistepScheduler,
-    LCMScheduler,
    MotionAdapter,
    PIAPipeline,
    StableDiffusionPipeline,
@@ -362,52 +360,6 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, PipelineFr
            "Disabling of FreeInit should lead to results similar to the default pipeline results",
        )

-    def test_free_init_with_schedulers(self):
-        components = self.get_dummy_components()
-        pipe: PIAPipeline = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.to(torch_device)
-
-        inputs_normal = self.get_dummy_inputs(torch_device)
-        frames_normal = pipe(**inputs_normal).frames[0]
-
-        schedulers_to_test = [
-            DPMSolverMultistepScheduler.from_config(
-                components["scheduler"].config,
-                timestep_spacing="linspace",
-                beta_schedule="linear",
-                algorithm_type="dpmsolver++",
-                steps_offset=1,
-                clip_sample=False,
-            ),
-            LCMScheduler.from_config(
-                components["scheduler"].config,
-                timestep_spacing="linspace",
-                beta_schedule="linear",
-                steps_offset=1,
-                clip_sample=False,
-            ),
-        ]
-        components.pop("scheduler")
-
-        for scheduler in schedulers_to_test:
-            components["scheduler"] = scheduler
-            pipe: PIAPipeline = self.pipeline_class(**components)
-            pipe.set_progress_bar_config(disable=None)
-            pipe.to(torch_device)
-
-            pipe.enable_free_init(num_iters=2, use_fast_sampling=False)
-
-            inputs = self.get_dummy_inputs(torch_device)
-            frames_enable_free_init = pipe(**inputs).frames[0]
-            sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
-
-            self.assertGreater(
-                sum_enabled,
-                1e1,
-                "Enabling of FreeInit should lead to results different from the default pipeline results",
-            )
-
    @unittest.skipIf(
        torch_device != "cuda" or not is_xformers_available(),
        reason="XFormers attention is only available with CUDA and `xformers` installed",
@@ -1,460 +0,0 @@
-# coding=utf-8
-# Copyright 2024 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-import gc
-import unittest
-
-import numpy as np
-import torch
-from transformers import (
-    T5EncoderModel,
-    T5Tokenizer,
-)
-
-from diffusers import (
-    AutoencoderOobleck,
-    CosineDPMSolverMultistepScheduler,
-    StableAudioDiTModel,
-    StableAudioPipeline,
-    StableAudioProjectionModel,
-)
-from diffusers.utils import is_xformers_available
-from diffusers.utils.testing_utils import enable_full_determinism, nightly, require_torch_gpu, torch_device
-
-from ..pipeline_params import TEXT_TO_AUDIO_BATCH_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin
-
-
-enable_full_determinism()
-
-
-class StableAudioPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = StableAudioPipeline
-    params = frozenset(
-        [
-            "prompt",
-            "audio_end_in_s",
-            "audio_start_in_s",
-            "guidance_scale",
-            "negative_prompt",
-            "prompt_embeds",
-            "negative_prompt_embeds",
-            "initial_audio_waveforms",
-        ]
-    )
-    batch_params = TEXT_TO_AUDIO_BATCH_PARAMS
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "num_waveforms_per_prompt",
-            "generator",
-            "latents",
-            "output_type",
-            "return_dict",
-            "callback",
-            "callback_steps",
-        ]
-    )
-
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        transformer = StableAudioDiTModel(
-            sample_size=4,
-            in_channels=3,
-            num_layers=2,
-            attention_head_dim=4,
-            num_key_value_attention_heads=2,
-            out_channels=3,
-            cross_attention_dim=4,
-            time_proj_dim=8,
-            global_states_input_dim=8,
-            cross_attention_input_dim=4,
-        )
-        scheduler = CosineDPMSolverMultistepScheduler(
-            solver_order=2,
-            prediction_type="v_prediction",
-            sigma_data=1.0,
-            sigma_schedule="exponential",
-        )
-        torch.manual_seed(0)
-        vae = AutoencoderOobleck(
-            encoder_hidden_size=6,
-            downsampling_ratios=[1, 2],
-            decoder_channels=3,
-            decoder_input_channels=3,
-            audio_channels=2,
-            channel_multiples=[2, 4],
-            sampling_rate=4,
-        )
-        torch.manual_seed(0)
-        t5_repo_id = "hf-internal-testing/tiny-random-T5ForConditionalGeneration"
-        text_encoder = T5EncoderModel.from_pretrained(t5_repo_id)
-        tokenizer = T5Tokenizer.from_pretrained(t5_repo_id, truncation=True, model_max_length=25)
-
-        torch.manual_seed(0)
-        projection_model = StableAudioProjectionModel(
-            text_encoder_dim=text_encoder.config.d_model,
-            conditioning_dim=4,
-            min_value=0,
-            max_value=32,
-        )
-
-        components = {
-            "transformer": transformer,
-            "scheduler": scheduler,
-            "vae": vae,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            "projection_model": projection_model,
-        }
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "A hammer hitting a wooden surface",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 6.0,
-        }
-        return inputs
-
-    def test_save_load_local(self):
-        # increase tolerance from 1e-4 -> 7e-3 to account for large composite model
-        super().test_save_load_local(expected_max_difference=7e-3)
-
-    def test_save_load_optional_components(self):
-        # increase tolerance from 1e-4 -> 7e-3 to account for large composite model
-        super().test_save_load_optional_components(expected_max_difference=7e-3)
-
-    def test_stable_audio_ddim(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-
-        components = self.get_dummy_components()
-        stable_audio_pipe = StableAudioPipeline(**components)
-        stable_audio_pipe = stable_audio_pipe.to(torch_device)
-        stable_audio_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        output = stable_audio_pipe(**inputs)
-        audio = output.audios[0]
-
-        assert audio.ndim == 2
-        assert audio.shape == (2, 7)
-
-    def test_stable_audio_without_prompts(self):
-        components = self.get_dummy_components()
-        stable_audio_pipe = StableAudioPipeline(**components)
-        stable_audio_pipe = stable_audio_pipe.to(torch_device)
-        stable_audio_pipe = stable_audio_pipe.to(torch_device)
-        stable_audio_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(torch_device)
-        inputs["prompt"] = 3 * [inputs["prompt"]]
-
-        # forward
-        output = stable_audio_pipe(**inputs)
-        audio_1 = output.audios[0]
-
-        inputs = self.get_dummy_inputs(torch_device)
-        prompt = 3 * [inputs.pop("prompt")]
-
-        text_inputs = stable_audio_pipe.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=stable_audio_pipe.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        ).to(torch_device)
-        text_input_ids = text_inputs.input_ids
-        attention_mask = text_inputs.attention_mask
-
-        prompt_embeds = stable_audio_pipe.text_encoder(
-            text_input_ids,
-            attention_mask=attention_mask,
-        )[0]
-
-        inputs["prompt_embeds"] = prompt_embeds
-        inputs["attention_mask"] = attention_mask
-
-        # forward
-        output = stable_audio_pipe(**inputs)
-        audio_2 = output.audios[0]
-
-        assert (audio_1 - audio_2).abs().max() < 1e-2
-
-    def test_stable_audio_negative_without_prompts(self):
-        components = self.get_dummy_components()
-        stable_audio_pipe = StableAudioPipeline(**components)
-        stable_audio_pipe = stable_audio_pipe.to(torch_device)
-        stable_audio_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(torch_device)
-        negative_prompt = 3 * ["this is a negative prompt"]
-        inputs["negative_prompt"] = negative_prompt
-        inputs["prompt"] = 3 * [inputs["prompt"]]
-
-        # forward
-        output = stable_audio_pipe(**inputs)
-        audio_1 = output.audios[0]
-
-        inputs = self.get_dummy_inputs(torch_device)
-        prompt = 3 * [inputs.pop("prompt")]
-
-        text_inputs = stable_audio_pipe.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=stable_audio_pipe.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        ).to(torch_device)
-        text_input_ids = text_inputs.input_ids
-        attention_mask = text_inputs.attention_mask
-
-        prompt_embeds = stable_audio_pipe.text_encoder(
-            text_input_ids,
-            attention_mask=attention_mask,
-        )[0]
-
-        inputs["prompt_embeds"] = prompt_embeds
-        inputs["attention_mask"] = attention_mask
-
-        negative_text_inputs = stable_audio_pipe.tokenizer(
-            negative_prompt,
-            padding="max_length",
-            max_length=stable_audio_pipe.tokenizer.model_max_length,
-            truncation=True,
-            return_tensors="pt",
-        ).to(torch_device)
-        negative_text_input_ids = negative_text_inputs.input_ids
-        negative_attention_mask = negative_text_inputs.attention_mask
-
-        negative_prompt_embeds = stable_audio_pipe.text_encoder(
-            negative_text_input_ids,
-            attention_mask=negative_attention_mask,
-        )[0]
-
-        inputs["negative_prompt_embeds"] = negative_prompt_embeds
-        inputs["negative_attention_mask"] = negative_attention_mask
-
-        # forward
-        output = stable_audio_pipe(**inputs)
-        audio_2 = output.audios[0]
-
-        assert (audio_1 - audio_2).abs().max() < 1e-2
-
-    def test_stable_audio_negative_prompt(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        stable_audio_pipe = StableAudioPipeline(**components)
-        stable_audio_pipe = stable_audio_pipe.to(device)
-        stable_audio_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        negative_prompt = "egg cracking"
-        output = stable_audio_pipe(**inputs, negative_prompt=negative_prompt)
-        audio = output.audios[0]
-
-        assert audio.ndim == 2
-        assert audio.shape == (2, 7)
-
-    def test_stable_audio_num_waveforms_per_prompt(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        stable_audio_pipe = StableAudioPipeline(**components)
-        stable_audio_pipe = stable_audio_pipe.to(device)
-        stable_audio_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A hammer hitting a wooden surface"
-
-        # test num_waveforms_per_prompt=1 (default)
-        audios = stable_audio_pipe(prompt, num_inference_steps=2).audios
-
-        assert audios.shape == (1, 2, 7)
-
-        # test num_waveforms_per_prompt=1 (default) for batch of prompts
-        batch_size = 2
-        audios = stable_audio_pipe([prompt] * batch_size, num_inference_steps=2).audios
-
-        assert audios.shape == (batch_size, 2, 7)
-
-        # test num_waveforms_per_prompt for single prompt
-        num_waveforms_per_prompt = 2
-        audios = stable_audio_pipe(
-            prompt, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt
-        ).audios
-
-        assert audios.shape == (num_waveforms_per_prompt, 2, 7)
-
-        # test num_waveforms_per_prompt for batch of prompts
-        batch_size = 2
-        audios = stable_audio_pipe(
-            [prompt] * batch_size, num_inference_steps=2, num_waveforms_per_prompt=num_waveforms_per_prompt
-        ).audios
-
-        assert audios.shape == (batch_size * num_waveforms_per_prompt, 2, 7)
-
-    def test_stable_audio_audio_end_in_s(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        stable_audio_pipe = StableAudioPipeline(**components)
-        stable_audio_pipe = stable_audio_pipe.to(torch_device)
-        stable_audio_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        output = stable_audio_pipe(audio_end_in_s=1.5, **inputs)
-        audio = output.audios[0]
-
-        assert audio.ndim == 2
-        assert audio.shape[1] / stable_audio_pipe.vae.sampling_rate == 1.5
-
-        output = stable_audio_pipe(audio_end_in_s=1.1875, **inputs)
-        audio = output.audios[0]
-
-        assert audio.ndim == 2
-        assert audio.shape[1] / stable_audio_pipe.vae.sampling_rate == 1.0
-
-    def test_attention_slicing_forward_pass(self):
-        self._test_attention_slicing_forward_pass(test_mean_pixel_difference=False)
-
-    def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical(expected_max_diff=5e-4)
-
-    @unittest.skipIf(
-        torch_device != "cuda" or not is_xformers_available(),
-        reason="XFormers attention is only available with CUDA and `xformers` installed",
-    )
-    def test_xformers_attention_forwardGenerator_pass(self):
-        self._test_xformers_attention_forwardGenerator_pass(test_mean_pixel_difference=False)
-
-    def test_stable_audio_input_waveform(self):
-        device = "cpu"  # ensure determinism for the device-dependent torch.Generator
-        components = self.get_dummy_components()
-        stable_audio_pipe = StableAudioPipeline(**components)
-        stable_audio_pipe = stable_audio_pipe.to(device)
-        stable_audio_pipe.set_progress_bar_config(disable=None)
-
-        prompt = "A hammer hitting a wooden surface"
-
-        initial_audio_waveforms = torch.ones((1, 5))
-
-        # test raises error when no sampling rate
-        with self.assertRaises(ValueError):
-            audios = stable_audio_pipe(
-                prompt, num_inference_steps=2, initial_audio_waveforms=initial_audio_waveforms
-            ).audios
-
-        # test raises error when wrong sampling rate
-        with self.assertRaises(ValueError):
-            audios = stable_audio_pipe(
-                prompt,
-                num_inference_steps=2,
-                initial_audio_waveforms=initial_audio_waveforms,
-                initial_audio_sampling_rate=stable_audio_pipe.vae.sampling_rate - 1,
-            ).audios
-
-        audios = stable_audio_pipe(
-            prompt,
-            num_inference_steps=2,
-            initial_audio_waveforms=initial_audio_waveforms,
-            initial_audio_sampling_rate=stable_audio_pipe.vae.sampling_rate,
-        ).audios
-        assert audios.shape == (1, 2, 7)
-
-        # test works with num_waveforms_per_prompt
-        num_waveforms_per_prompt = 2
-        audios = stable_audio_pipe(
-            prompt,
-            num_inference_steps=2,
-            num_waveforms_per_prompt=num_waveforms_per_prompt,
-            initial_audio_waveforms=initial_audio_waveforms,
-            initial_audio_sampling_rate=stable_audio_pipe.vae.sampling_rate,
-        ).audios
-
-        assert audios.shape == (num_waveforms_per_prompt, 2, 7)
-
-        # test num_waveforms_per_prompt for batch of prompts and input audio (two channels)
-        batch_size = 2
-        initial_audio_waveforms = torch.ones((batch_size, 2, 5))
-        audios = stable_audio_pipe(
-            [prompt] * batch_size,
-            num_inference_steps=2,
-            num_waveforms_per_prompt=num_waveforms_per_prompt,
-            initial_audio_waveforms=initial_audio_waveforms,
-            initial_audio_sampling_rate=stable_audio_pipe.vae.sampling_rate,
-        ).audios
-
-        assert audios.shape == (batch_size * num_waveforms_per_prompt, 2, 7)
-
-    @unittest.skip("Not supported yet")
-    def test_sequential_cpu_offload_forward_pass(self):
-        pass
-
-    @unittest.skip("Not supported yet")
-    def test_sequential_offload_forward_pass_twice(self):
-        pass
-
-
-@nightly
-@require_torch_gpu
-class StableAudioPipelineIntegrationTests(unittest.TestCase):
-    def setUp(self):
-        super().setUp()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def tearDown(self):
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def get_inputs(self, device, generator_device="cpu", dtype=torch.float32, seed=0):
-        generator = torch.Generator(device=generator_device).manual_seed(seed)
-        latents = np.random.RandomState(seed).standard_normal((1, 64, 1024))
-        latents = torch.from_numpy(latents).to(device=device, dtype=dtype)
-        inputs = {
-            "prompt": "A hammer hitting a wooden surface",
-            "latents": latents,
-            "generator": generator,
-            "num_inference_steps": 3,
-            "audio_end_in_s": 30,
-            "guidance_scale": 2.5,
-        }
-        return inputs
-
-    def test_stable_audio(self):
-        stable_audio_pipe = StableAudioPipeline.from_pretrained("stabilityai/stable-audio-open-1.0")
-        stable_audio_pipe = stable_audio_pipe.to(torch_device)
-        stable_audio_pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_inputs(torch_device)
-        inputs["num_inference_steps"] = 25
-        audio = stable_audio_pipe(**inputs).audios[0]
-
-        assert audio.ndim == 2
-        assert audio.shape == (2, int(inputs["audio_end_in_s"] * stable_audio_pipe.vae.sampling_rate))
-        # check the portion of the generated audio with the largest dynamic range (reduces flakiness)
-        audio_slice = audio[0, 447590:447600]
-        # fmt: off
-        expected_slice = np.array(
-            [-0.0278,  0.1096,  0.1877,  0.3178,  0.5329,  0.6990,  0.6972,  0.6186, 0.5608,  0.5060]
-        )
-         # fmt: one
-        max_diff = np.abs(expected_slice - audio_slice.detach().cpu().numpy()).max()
-        assert max_diff < 1.5e-3
Author	SHA1	Message	Date
Dhruv Nair	01de911cdf	update	2024-07-26 07:30:35 +00:00
Dhruv Nair	9872787810	update	2024-07-26 07:29:56 +00:00
Dhruv Nair	7a35a3f71c	update	2024-07-26 03:48:37 +00:00