update

2024-02-16 10:58:39 +00:00
76 changed files with 1201 additions and 4182 deletions
@@ -52,8 +52,6 @@
      title: Image-to-image
    - local: using-diffusers/inpaint
      title: Inpainting
-    - local: using-diffusers/text-img2vid
-      title: Text or image-to-video
    - local: using-diffusers/depth2img
      title: Depth-to-image
    title: Tasks
@@ -325,8 +323,6 @@
        title: Text-to-image
      - local: api/pipelines/stable_diffusion/img2img
        title: Image-to-image
-      - local: api/pipelines/stable_diffusion/svd
-        title: Image-to-video
      - local: api/pipelines/stable_diffusion/inpaint
        title: Inpainting
      - local: api/pipelines/stable_diffusion/depth2img
@@ -20,24 +20,6 @@ An attention processor is a class for applying different types of attention mech
 ## AttnProcessor2_0
 [[autodoc]] models.attention_processor.AttnProcessor2_0

-## AttnAddedKVProcessor
-[[autodoc]] models.attention_processor.AttnAddedKVProcessor
-
-## AttnAddedKVProcessor2_0
-[[autodoc]] models.attention_processor.AttnAddedKVProcessor2_0
-
-## CrossFrameAttnProcessor
-[[autodoc]] pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor
-
-## CustomDiffusionAttnProcessor
-[[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor
-
-## CustomDiffusionAttnProcessor2_0
-[[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor2_0
-
-## CustomDiffusionXFormersAttnProcessor
-[[autodoc]] models.attention_processor.CustomDiffusionXFormersAttnProcessor
-
 ## FusedAttnProcessor2_0
 [[autodoc]] models.attention_processor.FusedAttnProcessor2_0

@@ -47,17 +29,32 @@ An attention processor is a class for applying different types of attention mech
 ## LoRAAttnProcessor2_0
 [[autodoc]] models.attention_processor.LoRAAttnProcessor2_0

+## CustomDiffusionAttnProcessor
+[[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor
+
+## CustomDiffusionAttnProcessor2_0
+[[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor2_0
+
+## AttnAddedKVProcessor
+[[autodoc]] models.attention_processor.AttnAddedKVProcessor
+
+## AttnAddedKVProcessor2_0
+[[autodoc]] models.attention_processor.AttnAddedKVProcessor2_0
+
 ## LoRAAttnAddedKVProcessor
 [[autodoc]] models.attention_processor.LoRAAttnAddedKVProcessor

+## XFormersAttnProcessor
+[[autodoc]] models.attention_processor.XFormersAttnProcessor
+
 ## LoRAXFormersAttnProcessor
 [[autodoc]] models.attention_processor.LoRAXFormersAttnProcessor

+## CustomDiffusionXFormersAttnProcessor
+[[autodoc]] models.attention_processor.CustomDiffusionXFormersAttnProcessor
+
 ## SlicedAttnProcessor
 [[autodoc]] models.attention_processor.SlicedAttnProcessor

 ## SlicedAttnAddedKVProcessor
 [[autodoc]] models.attention_processor.SlicedAttnAddedKVProcessor
-
-## XFormersAttnProcessor
-[[autodoc]] models.attention_processor.XFormersAttnProcessor
@@ -408,91 +408,6 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)

 </Tip>

-## Using AnimateLCM
-
-[AnimateLCM](https://animatelcm.github.io/) is a motion module checkpoint and an [LCM LoRA](https://huggingface.co/docs/diffusers/using-diffusers/inference_with_lcm_lora) that have been created using a consistency learning strategy that decouples the distillation of the image generation priors and the motion generation priors.
-
-```python
-import torch
-from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
-from diffusers.utils import export_to_gif
-
-adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")
-pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter)
-pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
-
-pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="sd15_lora_beta.safetensors", adapter_name="lcm-lora")
-
-pipe.enable_vae_slicing()
-pipe.enable_model_cpu_offload()
-
-output = pipe(
-    prompt="A space rocket with trails of smoke behind it launching into space from the desert, 4k, high resolution",
-    negative_prompt="bad quality, worse quality, low resolution",
-    num_frames=16,
-    guidance_scale=1.5,
-    num_inference_steps=6,
-    generator=torch.Generator("cpu").manual_seed(0),
-)
-frames = output.frames[0]
-export_to_gif(frames, "animatelcm.gif")
-```
-
-<table>
-    <tr>
-        <td><center>
-        A space rocket, 4K.
-        <br>
-        <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatelcm-output.gif"
-            alt="A space rocket, 4K"
-            style="width: 300px;" />
-        </center></td>
-    </tr>
-</table>
-
-AnimateLCM is also compatible with existing [Motion LoRAs](https://huggingface.co/collections/dn6/animatediff-motion-loras-654cb8ad732b9e3cf4d3c17e).
-
-```python
-import torch
-from diffusers import AnimateDiffPipeline, LCMScheduler, MotionAdapter
-from diffusers.utils import export_to_gif
-
-adapter = MotionAdapter.from_pretrained("wangfuyun/AnimateLCM")
-pipe = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter)
-pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config, beta_schedule="linear")
-
-pipe.load_lora_weights("wangfuyun/AnimateLCM", weight_name="sd15_lora_beta.safetensors", adapter_name="lcm-lora")
-pipe.load_lora_weights("guoyww/animatediff-motion-lora-tilt-up", adapter_name="tilt-up")
-
-pipe.set_adapters(["lcm-lora", "tilt-up"], [1.0, 0.8])
-pipe.enable_vae_slicing()
-pipe.enable_model_cpu_offload()
-
-output = pipe(
-    prompt="A space rocket with trails of smoke behind it launching into space from the desert, 4k, high resolution",
-    negative_prompt="bad quality, worse quality, low resolution",
-    num_frames=16,
-    guidance_scale=1.5,
-    num_inference_steps=6,
-    generator=torch.Generator("cpu").manual_seed(0),
-)
-frames = output.frames[0]
-export_to_gif(frames, "animatelcm-motion-lora.gif")
-```
-
-<table>
-    <tr>
-        <td><center>
-        A space rocket, 4K.
-        <br>
-        <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatelcm-motion-lora.gif"
-            alt="A space rocket, 4K"
-            style="width: 300px;" />
-        </center></td>
-    </tr>
-</table>
-
-
 ## AnimateDiffPipeline

 [[autodoc]] AnimateDiffPipeline
@@ -1,43 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Stable Video Diffusion
-
-Stable Video Diffusion was proposed in [Stable Video Diffusion: Scaling Latent Video Diffusion Models to Large Datasets](https://hf.co/papers/2311.15127) by Andreas Blattmann, Tim Dockhorn, Sumith Kulal, Daniel Mendelevitch, Maciej Kilian, Dominik Lorenz, Yam Levi, Zion English, Vikram Voleti, Adam Letts, Varun Jampani, Robin Rombach.
-
-The abstract from the paper is:
-
-*We present Stable Video Diffusion - a latent video diffusion model for high-resolution, state-of-the-art text-to-video and image-to-video generation. Recently, latent diffusion models trained for 2D image synthesis have been turned into generative video models by inserting temporal layers and finetuning them on small, high-quality video datasets. However, training methods in the literature vary widely, and the field has yet to agree on a unified strategy for curating video data. In this paper, we identify and evaluate three different stages for successful training of video LDMs: text-to-image pretraining, video pretraining, and high-quality video finetuning. Furthermore, we demonstrate the necessity of a well-curated pretraining dataset for generating high-quality videos and present a systematic curation process to train a strong base model, including captioning and filtering strategies. We then explore the impact of finetuning our base model on high-quality data and train a text-to-video model that is competitive with closed-source video generation. We also show that our base model provides a powerful motion representation for downstream tasks such as image-to-video generation and adaptability to camera motion-specific LoRA modules. Finally, we demonstrate that our model provides a strong multi-view 3D-prior and can serve as a base to finetune a multi-view diffusion model that jointly generates multiple views of objects in a feedforward fashion, outperforming image-based methods at a fraction of their compute budget. We release code and model weights at this https URL.*
-
-<Tip>
-
-To learn how to use Stable Video Diffusion, take a look at the [Stable Video Diffusion](../../../using-diffusers/svd) guide.
-
-<br>
-
-Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the [base](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [extended frame](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt) checkpoints!
-
-</Tip>
-
-## Tips
-
-Video generation is memory-intensive and one way to reduce your memory usage is to set `enable_forward_chunking` on the pipeline's UNet so you don't run the entire feedforward layer at once. Breaking it up into chunks in a loop is more efficient.
-
-Check out the [Text or image-to-video](text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
-
-## StableVideoDiffusionPipeline
-
-[[autodoc]] StableVideoDiffusionPipeline
-
-## StableVideoDiffusionPipelineOutput
-
-[[autodoc]] pipelines.stable_video_diffusion.StableVideoDiffusionPipelineOutput
@@ -167,12 +167,6 @@ Here are some sample outputs:
    </tr>
 </table>

-## Tips
-
-Video generation is memory-intensive and one way to reduce your memory usage is to set `enable_forward_chunking` on the pipeline's UNet so you don't run the entire feedforward layer at once. Breaking it up into chunks in a loop is more efficient.
-
-Check out the [Text or image-to-video](text-img2vid) guide for more details about how certain parameters can affect video generation and how to optimize inference by reducing memory usage.
-
 <Tip>

 Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
@@ -245,7 +245,7 @@ Generating accurate faces is challenging because they are complex and nuanced. D
 * [ip-adapter-full-face_sd15.safetensors](https://huggingface.co/h94/IP-Adapter/blob/main/models/ip-adapter-full-face_sd15.safetensors) is conditioned with images of cropped faces and removed backgrounds
 * [ip-adapter-plus-face_sd15.safetensors](https://huggingface.co/h94/IP-Adapter/blob/main/models/ip-adapter-plus-face_sd15.safetensors) uses patch embeddings and is conditioned with images of cropped faces

-> [!TIP]
+> [!TIP]
 > [IP-Adapter-FaceID](https://huggingface.co/h94/IP-Adapter-FaceID) is a face-specific IP-Adapter trained with face ID embeddings instead of CLIP image embeddings, allowing you to generate more consistent faces in different contexts and styles. Try out this popular [community pipeline](https://github.com/huggingface/diffusers/tree/main/examples/community#ip-adapter-face-id) and see how it compares to the other face IP-Adapters.

 For face models, use the [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter) checkpoint. It is also recommended to use [`DDIMScheduler`] or [`EulerDiscreteScheduler`] for face models.
@@ -468,83 +468,3 @@ image
 <div class="flex justify-center">
     <img src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ipa-controlnet-out.png" />
 </div>
-
-### IP-Adapter masking
-
-Binary masks can be used to specify which portion of the output image should be assigned to an IP-Adapter.
-For each input IP-Adapter image, a binary mask and an IP-Adapter must be provided.
-
-Before passing the masks to the pipeline, it's essential to preprocess them using [`IPAdapterMaskProcessor.preprocess()`].
-
-> [!TIP]
-> For optimal results, provide the output height and width to [`IPAdapterMaskProcessor.preprocess()`]. This ensures that masks with differing aspect ratios are appropriately stretched. If the input masks already match the aspect ratio of the generated image, specifying height and width can be omitted.
-
-Here is an example with two masks:
-
-```py
-from diffusers.image_processor import IPAdapterMaskProcessor
-
-mask1 = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_mask1.png")
-mask2 = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_mask2.png")
-
-output_height = 1024
-output_width = 1024
-
-processor = IPAdapterMaskProcessor()
-masks = processor.preprocess([mask1, mask2], height=output_height, width=output_width)
-```
-
-<div class="flex flex-row gap-4">
-  <div class="flex-1">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_mask1.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">mask one</figcaption>
-  </div>
-  <div class="flex-1">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_mask2.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">mask two</figcaption>
-  </div>
-</div>
-
-If you have more than one IP-Adapter image, load them into a list, ensuring each image is assigned to a different IP-Adapter.
-
-```py
-face_image1 = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_girl1.png")
-face_image2 = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_girl2.png")
-
-ip_images =[[image1], [image2]]
-
-```
-
-<div class="flex flex-row gap-4">
-  <div class="flex-1">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_girl1.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">ip adapter image one</figcaption>
-  </div>
-  <div class="flex-1">
-    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_girl2.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">ip adapter image two</figcaption>
-  </div>
-</div>
-
-Pass preprocessed masks to the pipeline using `cross_attention_kwargs` as shown below:
-
-```py
-
-pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"] * 2)
-pipeline.set_ip_adapter_scale([0.7] * 2)
-generator = torch.Generator(device="cpu").manual_seed(0)
-num_images=1
-
-image = pipeline(
-    prompt="2 girls",
-    ip_adapter_image=ip_images,
-    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality", 
-    num_inference_steps=20, num_images_per_prompt=num_images, 
-    generator=generator, cross_attention_kwargs={"ip_adapter_masks": masks}
-).images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_attention_mask_result_seed_0.png" />
-   <figcaption class="mt-2 text-center text-sm text-gray-500">output image</figcaption>
-</div>
@@ -63,12 +63,11 @@ from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipelin
 import torch

 pipeline = StableDiffusionXLPipeline.from_single_file(
-    "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors", 
-    torch_dtype=torch.float16
+    "https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/blob/main/sd_xl_base_1.0.safetensors", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
 ).to("cuda")

 refiner = StableDiffusionXLImg2ImgPipeline.from_single_file(
-    "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0.safetensors", torch_dtype=torch.float16
+    "https://huggingface.co/stabilityai/stable-diffusion-xl-refiner-1.0/blob/main/sd_xl_refiner_1.0.safetensors", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
 ).to("cuda")
 ```

@@ -1,497 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Text or image-to-video
-
-Driven by the success of text-to-image diffusion models, generative video models are able to generate short clips of video from a text prompt or an initial image. These models extend a pretrained diffusion model to generate videos by adding some type of temporal and/or spatial convolution layer to the architecture. A mixed dataset of images and videos is used to train the model, which learns to output a series of video frames based on the text or image conditioning.
-
-This guide will show you how to generate videos, how to configure video model parameters, and how to control video generation.
-
-## Popular models
-
-> [!TIP]
-> Discover other cool and trending video generation models on the Hub [here](https://huggingface.co/models?pipeline_tag=text-to-video&sort=trending)!
-
-[Stable Video Diffusion (SVD)](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid), [I2VGen-XL](https://huggingface.co/ali-vilab/i2vgen-xl/), [AnimateDiff](https://huggingface.co/guoyww/animatediff), and [ModelScopeT2V](https://huggingface.co/ali-vilab/text-to-video-ms-1.7b) are popular models used for video diffusion. Each model is distinct. For example, AnimateDiff inserts a motion modeling module into a frozen text-to-image model to generate personalized animated images, whereas SVD is entirely pretrained from scratch with a three-stage training process to generate short high-quality videos.
-
-### Stable Video Diffusion
-
-[SVD](../api/pipelines/svd) is based on the Stable Diffusion 2.1 model and it is trained on images, then low-resolution videos, and finally a smaller dataset of high-resolution videos. This model generates a short 2-4 second video from an initial image. You can learn more details about the model, like micro-conditioning, in the [Stable Video Diffusion](../using-diffusers/svd) guide.
-
-Begin by loading the [`StableVideoDiffusionPipeline`] and passing an initial image to generate a video from.
-
-```py
-import torch
-from diffusers import StableVideoDiffusionPipeline
-from diffusers.utils import load_image, export_to_video
-
-pipeline = StableVideoDiffusionPipeline.from_pretrained(
-    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
-)
-pipeline.enable_model_cpu_offload()
-
-image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png")
-image = image.resize((1024, 576))
-
-generator = torch.manual_seed(42)
-frames = pipeline(image, decode_chunk_size=8, generator=generator).frames[0]
-export_to_video(frames, "generated.mp4", fps=7)
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/output_rocket.gif"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">generated video</figcaption>
-  </div>
-</div>
-
-### I2VGen-XL
-
-[I2VGen-XL](../api/pipelines/i2vgenxl) is a diffusion model that can generate higher resolution videos than SVD and it is also capable of accepting text prompts in addition to images. The model is trained with two hierarchical encoders (detail and global encoder) to better capture low and high-level details in images. These learned details are used to train a video diffusion model which refines the video resolution and details in the generated video.
-
-You can use I2VGen-XL by loading the [`I2VGenXLPipeline`], and passing a text and image prompt to generate a video.
-
-```py
-import torch
-from diffusers import I2VGenXLPipeline
-from diffusers.utils import export_to_gif, load_image
-
-pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
-pipeline.enable_model_cpu_offload()
-
-image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
-image = load_image(image_url).convert("RGB")
-
-prompt = "Papers were floating in the air on a table in the library"
-negative_prompt = "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms"
-generator = torch.manual_seed(8888)
-
-frames = pipeline(
-    prompt=prompt,
-    image=image,
-    num_inference_steps=50,
-    negative_prompt=negative_prompt,
-    guidance_scale=9.0,
-    generator=generator
-).frames[0]
-export_to_gif(frames, "i2v.gif")
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/i2vgen-xl-example.gif"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">generated video</figcaption>
-  </div>
-</div>
-
-### AnimateDiff
-
-[AnimateDiff](../api/pipelines/animatediff) is an adapter model that inserts a motion module into a pretrained diffusion model to animate an image. The adapter is trained on video clips to learn motion which is used to condition the generation process to create a video. It is faster and easier to only train the adapter and it can be loaded into most diffusion models, effectively turning them into "video models".
-
-Start by loading a [`MotionAdapter`].
-
-```py
-import torch
-from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
-from diffusers.utils import export_to_gif
-
-adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
-```
-
-Then load a finetuned Stable Diffusion model with the [`AnimateDiffPipeline`].
-
-```py
-pipeline = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
-scheduler = DDIMScheduler.from_pretrained(
-    "emilianJR/epiCRealism",
-    subfolder="scheduler",
-    clip_sample=False,
-    timestep_spacing="linspace",
-    beta_schedule="linear",
-    steps_offset=1,
-)
-pipeline.scheduler = scheduler
-pipeline.enable_vae_slicing()
-pipeline.enable_model_cpu_offload()
-```
-
-Create a prompt and generate the video.
-
-```py
-output = pipeline(
-    prompt="A space rocket with trails of smoke behind it launching into space from the desert, 4k, high resolution",
-    negative_prompt="bad quality, worse quality, low resolution",
-    num_frames=16,
-    guidance_scale=7.5,
-    num_inference_steps=50,
-    generator=torch.Generator("cpu").manual_seed(49),
-)
-frames = output.frames[0]
-export_to_gif(frames, "animation.gif")
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff.gif"/>
-</div>
-
-### ModelscopeT2V
-
-[ModelscopeT2V](../api/pipelines/text_to_video) adds spatial and temporal convolutions and attention to a UNet, and it is trained on image-text and video-text datasets to enhance what it learns during training. The model takes a prompt, encodes it and creates text embeddings which are denoised by the UNet, and then decoded by a VQGAN into a video.
-
-<Tip>
-
-ModelScopeT2V generates watermarked videos due to the datasets it was trained on. To use a watermark-free model, try the [cerspense/zeroscope_v2_576w](https://huggingface.co/cerspense/zeroscope_v2_576w) model with the [`TextToVideoSDPipeline`] first, and then upscale its output with the [cerspense/zeroscope_v2_XL](https://huggingface.co/cerspense/zeroscope_v2_XL) checkpoint using the [`VideoToVideoSDPipeline`].
-
-</Tip>
-
-Load a ModelScopeT2V checkpoint into the [`DiffusionPipeline`] along with a prompt to generate a video.
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-from diffusers.utils import export_to_video
-
-pipeline = DiffusionPipeline.from_pretrained("damo-vilab/text-to-video-ms-1.7b", torch_dtype=torch.float16, variant="fp16")
-pipeline.enable_model_cpu_offload()
-pipeline.enable_vae_slicing()
-
-prompt = "Confident teddy bear surfer rides the wave in the tropics"
-video_frames = pipeline(prompt).frames[0]
-export_to_video(video_frames, "modelscopet2v.mp4", fps=10)
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/modelscopet2v.gif" />
-</div>
-
-## Configure model parameters
-
-There are a few important parameters you can configure in the pipeline that'll affect the video generation process and quality. Let's take a closer look at what these parameters do and how changing them affects the output.
-
-### Number of frames
-
-The `num_frames` parameter determines how many video frames are generated per second. A frame is an image that is played in a sequence of other frames to create motion or a video. This affects video length because the pipeline generates a certain number of frames per second (check a pipeline's API reference for the default value). To increase the video duration, you'll need to increase the `num_frames` parameter.
-
-```py
-import torch
-from diffusers import StableVideoDiffusionPipeline
-from diffusers.utils import load_image, export_to_video
-
-pipeline = StableVideoDiffusionPipeline.from_pretrained(
-    "stabilityai/stable-video-diffusion-img2vid", torch_dtype=torch.float16, variant="fp16"
-)
-pipeline.enable_model_cpu_offload()
-
-image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png")
-image = image.resize((1024, 576))
-
-generator = torch.manual_seed(42)
-frames = pipeline(image, decode_chunk_size=8, generator=generator, num_frames=25).frames[0]
-export_to_video(frames, "generated.mp4", fps=7)
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/num_frames_14.gif"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">num_frames=14</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/num_frames_25.gif"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">num_frames=25</figcaption>
-  </div>
-</div>
-
-### Guidance scale
-
-The `guidance_scale` parameter controls how closely aligned the generated video and text prompt or initial image is. A higher `guidance_scale` value means your generated video is more aligned with the text prompt or initial image, while a lower `guidance_scale` value means your generated video is less aligned which could give the model more "creativity" to interpret the conditioning input.
-
-<Tip>
-
-SVD uses the `min_guidance_scale` and `max_guidance_scale` parameters for applying guidance to the first and last frames respectively.
-
-</Tip>
-
-```py
-import torch
-from diffusers import I2VGenXLPipeline
-from diffusers.utils import export_to_gif, load_image
-
-pipeline = I2VGenXLPipeline.from_pretrained("ali-vilab/i2vgen-xl", torch_dtype=torch.float16, variant="fp16")
-pipeline.enable_model_cpu_offload()
-
-image_url = "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/i2vgen_xl_images/img_0009.png"
-image = load_image(image_url).convert("RGB")
-
-prompt = "Papers were floating in the air on a table in the library"
-negative_prompt = "Distorted, discontinuous, Ugly, blurry, low resolution, motionless, static, disfigured, disconnected limbs, Ugly faces, incomplete arms"
-generator = torch.manual_seed(0)
-
-frames = pipeline(
-    prompt=prompt,
-    image=image,
-    num_inference_steps=50,
-    negative_prompt=negative_prompt,
-    guidance_scale=1.0,
-    generator=generator
-).frames[0]
-export_to_gif(frames, "i2v.gif")
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/i2vgen-xl-example.gif"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">guidance_scale=9.0</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/guidance_scale_1.0.gif"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">guidance_scale=1.0</figcaption>
-  </div>
-</div>
-
-### Negative prompt
-
-A negative prompt deters the model from generating things you don’t want it to. This parameter is commonly used to improve overall generation quality by removing poor or bad features such as “low resolution” or “bad details”.
-
-```py
-import torch
-from diffusers import AnimateDiffPipeline, DDIMScheduler, MotionAdapter
-from diffusers.utils import export_to_gif
-
-adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
-
-pipeline = AnimateDiffPipeline.from_pretrained("emilianJR/epiCRealism", motion_adapter=adapter, torch_dtype=torch.float16)
-scheduler = DDIMScheduler.from_pretrained(
-    "emilianJR/epiCRealism",
-    subfolder="scheduler",
-    clip_sample=False,
-    timestep_spacing="linspace",
-    beta_schedule="linear",
-    steps_offset=1,
-)
-pipeline.scheduler = scheduler
-pipeline.enable_vae_slicing()
-pipeline.enable_model_cpu_offload()
-
-output = pipeline(
-    prompt="360 camera shot of a sushi roll in a restaurant",
-    negative_prompt="Distorted, discontinuous, ugly, blurry, low resolution, motionless, static",
-    num_frames=16,
-    guidance_scale=7.5,
-    num_inference_steps=50,
-    generator=torch.Generator("cpu").manual_seed(0),
-)
-frames = output.frames[0]
-export_to_gif(frames, "animation.gif")
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff_no_neg.gif"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">no negative prompt</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff_neg.gif"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">negative prompt applied</figcaption>
-  </div>
-</div>
-
-### Model-specific parameters
-
-There are some pipeline parameters that are unique to each model such as adjusting the motion in a video or adding noise to the initial image.
-
-<hfoptions id="special-parameters">
-<hfoption id="Stable Video Diffusion">
-
-Stable Video Diffusion provides additional micro-conditioning for the frame rate with the `fps` parameter and for motion with the `motion_bucket_id` parameter. Together, these parameters allow for adjusting the amount of motion in the generated video.
-
-There is also a `noise_aug_strength` parameter that increases the amount of noise added to the initial image. Varying this parameter affects how similar the generated video and initial image are. A higher `noise_aug_strength` also increases the amount of motion. To learn more, read the [Micro-conditioning](../using-diffusers/svd#micro-conditioning) guide.
-
-</hfoption>
-<hfoption id="Text2Video-Zero">
-
-Text2Video-Zero computes the amount of motion to apply to each frame from randomly sampled latents. You can use the `motion_field_strength_x` and `motion_field_strength_y` parameters to control the amount of motion to apply to the x and y-axes of the video. The parameters `t0` and `t1` are the timesteps to apply motion to the latents.
-
-</hfoption>
-</hfoptions>
-
-## Control video generation
-
-Video generation can be controlled similar to how text-to-image, image-to-image, and inpainting can be controlled with a [`ControlNetModel`]. The only difference is you need to use the [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor`] so each frame attends to the first frame.
-
-### Text2Video-Zero
-
-Text2Video-Zero video generation can be conditioned on pose and edge images for even greater control over a subject's motion in the generated video or to preserve the identity of a subject/object in the video. You can also use Text2Video-Zero with [InstructPix2Pix](../api/pipelines/pix2pix) for editing videos with text.
-
-<hfoptions id="t2v-zero">
-<hfoption id="pose control">
-
-Start by downloading a video and extracting the pose images from it.
-
-```py
-from huggingface_hub import hf_hub_download
-from PIL import Image
-import imageio
-
-filename = "__assets__/poses_skeleton_gifs/dance1_corr.mp4"
-repo_id = "PAIR/Text2Video-Zero"
-video_path = hf_hub_download(repo_type="space", repo_id=repo_id, filename=filename)
-
-reader = imageio.get_reader(video_path, "ffmpeg")
-frame_count = 8
-pose_images = [Image.fromarray(reader.get_data(i)) for i in range(frame_count)]
-```
-
-Load a [`ControlNetModel`] for pose estimation and a checkpoint into the [`StableDiffusionControlNetPipeline`]. Then you'll use the [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor`] for the UNet and ControlNet.
-
-```py
-import torch
-from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
-from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
-
-model_id = "runwayml/stable-diffusion-v1-5"
-controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-openpose", torch_dtype=torch.float16)
-pipeline = StableDiffusionControlNetPipeline.from_pretrained(
-    model_id, controlnet=controlnet, torch_dtype=torch.float16
-).to("cuda")
-
-pipeline.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
-pipeline.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
-```
-
-Fix the latents for all the frames, and then pass your prompt and extracted pose images to the model to generate a video.
-
-```py
-latents = torch.randn((1, 4, 64, 64), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1)
-
-prompt = "Darth Vader dancing in a desert"
-result = pipeline(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images
-imageio.mimsave("video.mp4", result, fps=4)
-```
-
-</hfoption>
-<hfoption id="edge control">
-
-Download a video and extract the edges from it.
-
-```py
-from huggingface_hub import hf_hub_download
-from PIL import Image
-import imageio
-
-filename = "__assets__/poses_skeleton_gifs/dance1_corr.mp4"
-repo_id = "PAIR/Text2Video-Zero"
-video_path = hf_hub_download(repo_type="space", repo_id=repo_id, filename=filename)
-
-reader = imageio.get_reader(video_path, "ffmpeg")
-frame_count = 8
-pose_images = [Image.fromarray(reader.get_data(i)) for i in range(frame_count)]
-```
-
-Load a [`ControlNetModel`] for canny edge and a checkpoint into the [`StableDiffusionControlNetPipeline`]. Then you'll use the [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor`] for the UNet and ControlNet.
-
-```py
-import torch
-from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
-from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
-
-model_id = "runwayml/stable-diffusion-v1-5"
-controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16)
-pipeline = StableDiffusionControlNetPipeline.from_pretrained(
-    model_id, controlnet=controlnet, torch_dtype=torch.float16
-).to("cuda")
-
-pipeline.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
-pipeline.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
-```
-
-Fix the latents for all the frames, and then pass your prompt and extracted edge images to the model to generate a video.
-
-```py
-latents = torch.randn((1, 4, 64, 64), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1)
-
-prompt = "Darth Vader dancing in a desert"
-result = pipeline(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images
-imageio.mimsave("video.mp4", result, fps=4)
-```
-
-</hfoption>
-<hfoption id="InstructPix2Pix">
-
-InstructPix2Pix allows you to use text to describe the changes you want to make to the video. Start by downloading and reading a video.
-
-```py
-from huggingface_hub import hf_hub_download
-from PIL import Image
-import imageio
-
-filename = "__assets__/pix2pix video/camel.mp4"
-repo_id = "PAIR/Text2Video-Zero"
-video_path = hf_hub_download(repo_type="space", repo_id=repo_id, filename=filename)
-
-reader = imageio.get_reader(video_path, "ffmpeg")
-frame_count = 8
-video = [Image.fromarray(reader.get_data(i)) for i in range(frame_count)]
-```
-
-Load the [`StableDiffusionInstructPix2PixPipeline`] and set the [`~pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor`] for the UNet.
-
-```py
-import torch
-from diffusers import StableDiffusionInstructPix2PixPipeline
-from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
-
-pipeline = StableDiffusionInstructPix2PixPipeline.from_pretrained("timbrooks/instruct-pix2pix", torch_dtype=torch.float16).to("cuda")
-pipeline.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=3))
-```
-
-Pass a prompt describing the change you want to apply to the video.
-
-```py
-prompt = "make it Van Gogh Starry Night style"
-result = pipeline(prompt=[prompt] * len(video), image=video).images
-imageio.mimsave("edited_video.mp4", result, fps=4)
-```
-
-</hfoption>
-</hfoptions>
-
-## Optimize
-
-Video generation requires a lot of memory because you're generating many video frames at once. You can reduce your memory requirements at the expense of some inference speed. Try:
-
-1. offloading pipeline components that are no longer needed to the CPU
-2. feed-forward chunking runs the feed-forward layer in a loop instead of all at once
-3. break up the number of frames the VAE has to decode into chunks instead of decoding them all at once
-
-```diff
- pipeline.enable_model_cpu_offload()
- frames = pipeline(image, decode_chunk_size=8, generator=generator).frames[0]
-+ pipeline.enable_model_cpu_offload()
-+ pipeline.unet.enable_forward_chunking()
-+ frames = pipeline(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
-```
-
-If memory is not an issue and you want to optimize for speed, try wrapping the UNet with [`torch.compile`](../optimization/torch2.0#torchcompile).
-
-```diff
- pipeline.enable_model_cpu_offload()
-+ pipeline.to("cuda")
-+ pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
-```
@@ -313,12 +313,12 @@ from diffusers import StableDiffusionXLPipeline, StableDiffusionXLImg2ImgPipelin
 import torch

 pipe = StableDiffusionXLPipeline.from_single_file(
-    "./sd_xl_base_1.0.safetensors", torch_dtype=torch.float16
+    "./sd_xl_base_1.0.safetensors", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
 )
 pipe.to("cuda")

 refiner = StableDiffusionXLImg2ImgPipeline.from_single_file(
-    "./sd_xl_refiner_1.0.safetensors", torch_dtype=torch.float16
+    "./sd_xl_refiner_1.0.safetensors", torch_dtype=torch.float16, use_safetensors=True, variant="fp16"
 )
 refiner.to("cuda")
 ```
@@ -57,13 +57,12 @@ If a community doesn't work as expected, please open an issue and ping the autho
 |   DemoFusion Pipeline                                                                                                    | Implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [DemoFusion Pipeline](#DemoFusion)      | - |              [Ruoyi Du](https://github.com/RuoyiDu) |
 |   Instaflow Pipeline                                                                                                    | Implementation of [InstaFlow! One-Step Stable Diffusion with Rectified Flow](https://arxiv.org/abs/2309.06380)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Instaflow Pipeline](#instaflow-pipeline)      | - |              [Ayush Mangal](https://github.com/ayushtues) |
 |   Null-Text Inversion Pipeline  | Implement [Null-text Inversion for Editing Real Images using Guided Diffusion Models](https://arxiv.org/abs/2211.09794) as a pipeline.                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Null-Text Inversion](https://github.com/google/prompt-to-prompt/)      | - |              [Junsheng Luan](https://github.com/Junsheng121) |
-|   Rerender A Video Pipeline                                                                                                    | Implementation of [[SIGGRAPH Asia 2023] Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation](https://arxiv.org/abs/2306.07954)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Rerender A Video Pipeline](#Rerender-A-Video)      | - |              [Yifan Zhou](https://github.com/SingleZombie) |
+|   Rerender A Video Pipeline                                                                                                    | Implementation of [[SIGGRAPH Asia 2023] Rerender A Video: Zero-Shot Text-Guided Video-to-Video Translation](https://arxiv.org/abs/2306.07954)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Rerender A Video Pipeline](#Rerender_A_Video)      | - |              [Yifan Zhou](https://github.com/SingleZombie) |
 | StyleAligned Pipeline                                                                                                    | Implementation of [Style Aligned Image Generation via Shared Attention](https://arxiv.org/abs/2312.02133)                                                                                                                                                                                                                                                                                                                                                                                                                                   | [StyleAligned Pipeline](#stylealigned-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/15X2E0jFPTajUIjS0FzX50OaHsCbP2lQ0/view?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) |
 | AnimateDiff Image-To-Video Pipeline | Experimental Image-To-Video support for AnimateDiff (open to improvements) | [AnimateDiff Image To Video Pipeline](#animatediff-image-to-video-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://drive.google.com/file/d/1TvzCDPHhfFtdcJZe4RLloAwyoLKuttWK/view?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) |
 |   IP Adapter FaceID Stable Diffusion                                                                                               | Stable Diffusion Pipeline that supports IP Adapter Face ID                                                                                                                                                                                                                                                                                                                                                  |  [IP Adapter Face ID](#ip-adapter-face-id) | - | [Fabio Rigano](https://github.com/fabiorigano) |
 |   InstantID Pipeline                                                                                               | Stable Diffusion XL Pipeline that supports InstantID                                                                                                                                                                                                                                                                                                                                                 |  [InstantID Pipeline](#instantid-pipeline) | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/InstantX/InstantID) | [Haofan Wang](https://github.com/haofanwang) |
 |   UFOGen Scheduler                                                                                               | Scheduler for UFOGen Model (compatible with Stable Diffusion pipelines)                                                                                                                                                                                                                                                                                                                                                 |  [UFOGen Scheduler](#ufogen-scheduler) | - | [dg845](https://github.com/dg845) |
-| Stable Diffusion XL IPEX Pipeline | Accelerate Stable Diffusion XL inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion XL on IPEX](#stable-diffusion-xl-on-ipex) | - | [Dan Li](https://github.com/ustcuna/) |

 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.

@@ -1708,111 +1707,6 @@ print("Latency of StableDiffusionPipeline--fp32",latency)

 ```

-### Stable Diffusion XL on IPEX
-
-This diffusion pipeline aims to accelarate the inference of Stable-Diffusion XL on Intel Xeon CPUs with BF16/FP32 precision using [IPEX](https://github.com/intel/intel-extension-for-pytorch).
-
-To use this pipeline, you need to:
-1. Install [IPEX](https://github.com/intel/intel-extension-for-pytorch)
-
-**Note:** For each PyTorch release, there is a corresponding release of IPEX. Here is the mapping relationship. It is recommended to install Pytorch/IPEX2.0 to get the best performance.
-
-|PyTorch Version|IPEX Version|
-|--|--|
-|[v2.0.\*](https://github.com/pytorch/pytorch/tree/v2.0.1 "v2.0.1")|[v2.0.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v2.0.100+cpu)|
-|[v1.13.\*](https://github.com/pytorch/pytorch/tree/v1.13.0 "v1.13.0")|[v1.13.\*](https://github.com/intel/intel-extension-for-pytorch/tree/v1.13.100+cpu)|
-
-You can simply use pip to install IPEX with the latest version.
-```python
-python -m pip install intel_extension_for_pytorch
-```
-**Note:** To install a specific version, run with the following command:
-```
-python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
-```
-
-2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX accelaration. Supported inference datatypes are Float32 and BFloat16.
-
-**Note:** The values of `height` and `width` used during preparation with `prepare_for_ipex()` should be the same when running inference with the prepared pipeline.
-
-```python
-pipe = StableDiffusionXLPipelineIpex.from_pretrained("stabilityai/sdxl-turbo", low_cpu_mem_usage=True, use_safetensors=True)
-# value of image height/width should be consistent with the pipeline inference
-# For Float32
-pipe.prepare_for_ipex(torch.float32, prompt, height=512, width=512)
-# For BFloat16
-pipe.prepare_for_ipex(torch.bfloat16, prompt, height=512, width=512)
-```
-
-Then you can use the ipex pipeline in a similar way to the default stable diffusion xl pipeline.
-```python
-# value of image height/width should be consistent with 'prepare_for_ipex()'
-# For Float32
-image = pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=guidance_scale).images[0]
-# For BFloat16
-with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
-    image = pipe(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=guidance_scale).images[0]
-```
-
-The following code compares the performance of the original stable diffusion xl pipeline with the ipex-optimized pipeline.
-By using this optimized pipeline, we can get about 1.4-2 times performance boost with BFloat16 on fourth generation of Intel Xeon CPUs, 
-code-named Sapphire Rapids.
-
-```python
-import torch
-from diffusers import StableDiffusionXLPipeline
-from pipeline_stable_diffusion_xl_ipex import StableDiffusionXLPipelineIpex
-import time
-
-prompt = "sailing ship in storm by Rembrandt"
-model_id = "stabilityai/sdxl-turbo"
-steps = 4
-
-# Helper function for time evaluation
-def elapsed_time(pipeline, nb_pass=3, num_inference_steps=1):
-    # warmup
-    for _ in range(2):
-        images = pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=0.0).images
-    #time evaluation
-    start = time.time()
-    for _ in range(nb_pass):
-        pipeline(prompt, num_inference_steps=num_inference_steps, height=512, width=512, guidance_scale=0.0)
-    end = time.time()
-    return (end - start) / nb_pass
-
-##############     bf16 inference performance    ###############
-
-# 1. IPEX Pipeline initialization
-pipe = StableDiffusionXLPipelineIpex.from_pretrained(model_id, low_cpu_mem_usage=True, use_safetensors=True)
-pipe.prepare_for_ipex(torch.bfloat16, prompt, height=512, width=512)
-
-# 2. Original Pipeline initialization
-pipe2 = StableDiffusionXLPipeline.from_pretrained(model_id, low_cpu_mem_usage=True, use_safetensors=True)
-
-# 3. Compare performance between Original Pipeline and IPEX Pipeline
-with torch.cpu.amp.autocast(enabled=True, dtype=torch.bfloat16):
-    latency = elapsed_time(pipe, num_inference_steps=steps)
-    print("Latency of StableDiffusionXLPipelineIpex--bf16", latency, "s for total", steps, "steps")
-    latency = elapsed_time(pipe2, num_inference_steps=steps)
-    print("Latency of StableDiffusionXLPipeline--bf16", latency, "s for total", steps, "steps")
-
-##############     fp32 inference performance    ###############
-
-# 1. IPEX Pipeline initialization
-pipe3 = StableDiffusionXLPipelineIpex.from_pretrained(model_id, low_cpu_mem_usage=True, use_safetensors=True)
-pipe3.prepare_for_ipex(torch.float32, prompt, height=512, width=512)
-
-# 2. Original Pipeline initialization
-pipe4 = StableDiffusionXLPipeline.from_pretrained(model_id, low_cpu_mem_usage=True, use_safetensors=True)
-
-# 3. Compare performance between Original Pipeline and IPEX Pipeline
-latency = elapsed_time(pipe3, num_inference_steps=steps)
-print("Latency of StableDiffusionXLPipelineIpex--fp32", latency, "s for total", steps, "steps")
-latency = elapsed_time(pipe4, num_inference_steps=steps)
-print("Latency of StableDiffusionXLPipeline--fp32",latency, "s for total", steps, "steps")
-
-```
-
 ### CLIP Guided Images Mixing With Stable Diffusion

 ![clip_guided_images_mixing_examples](https://huggingface.co/datasets/TheDenk/images_mixing/resolve/main/main.png)
@@ -3412,9 +3306,10 @@ inverted_latent, uncond = pipeline.invert(input_image, invert_prompt, num_inner_
 pipeline(prompt, uncond, inverted_latent, guidance_scale=7.5, num_inference_steps=steps).images[0].save(input_image+".output.jpg")
 ```

-### Rerender A Video
+### Rerender_A_Video

-This is the Diffusers implementation of zero-shot video-to-video translation pipeline [Rerender A Video](https://github.com/williamyang1991/Rerender_A_Video) (without Ebsynth postprocessing). To run the code, please install gmflow. Then modify the path in `examples/community/rerender_a_video.py`:
+```
+This is the Diffusers implementation of zero-shot video-to-video translation pipeline [Rerender_A_Video](https://github.com/williamyang1991/Rerender_A_Video) (without Ebsynth postprocessing). To run the code, please install gmflow. Then modify the path in `examples/community/rerender_a_video.py`:

 ```py
 gmflow_dir = "/path/to/gmflow"
@@ -81,8 +81,6 @@ class CheckpointMergerPipeline(DiffusionPipeline):

                force - Whether to ignore mismatch in model_config.json for the current models. Defaults to False.

-                variant - which variant of a pretrained model to load, e.g. "fp16" (None)
-
        """
        # Default kwargs from DiffusionPipeline
        cache_dir = kwargs.pop("cache_dir", None)
@@ -91,7 +89,6 @@ class CheckpointMergerPipeline(DiffusionPipeline):
        proxies = kwargs.pop("proxies", None)
        local_files_only = kwargs.pop("local_files_only", False)
        token = kwargs.pop("token", None)
-        variant = kwargs.pop("variant", None)
        revision = kwargs.pop("revision", None)
        torch_dtype = kwargs.pop("torch_dtype", None)
        device_map = kwargs.pop("device_map", None)
@@ -176,10 +173,7 @@ class CheckpointMergerPipeline(DiffusionPipeline):
        # Step 3:-
        # Load the first checkpoint as a diffusion pipeline and modify its module state_dict in place
        final_pipe = DiffusionPipeline.from_pretrained(
-            cached_folders[0],
-            torch_dtype=torch_dtype,
-            device_map=device_map,
-            variant=variant,
+            cached_folders[0], torch_dtype=torch_dtype, device_map=device_map
        )
        final_pipe.to(self.device)

@@ -4,7 +4,7 @@ The `train_text_to_image.py` script shows how to fine-tune stable diffusion mode

 ___Note___:

-___This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparameters to get the best result on your dataset.___
+___This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparameters to get the best result on your dataset.___


 ## Running locally with PyTorch
@@ -2,7 +2,7 @@

 The `train_text_to_image_sdxl.py` script shows how to fine-tune Stable Diffusion XL (SDXL) on your own dataset.

-🚨 This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparameters to get the best result on your dataset. 🚨
+🚨 This script is experimental. The script fine-tunes the whole model and often times the model overfits and runs into issues like catastrophic forgetting. It's recommended to try different hyperparameters to get the best result on your dataset. 🚨

 ## Running locally with PyTorch

@@ -238,8 +238,8 @@ accelerate launch  --config_file $ACCELERATE_CONFIG_FILE train_text_to_image_lor
  --validation_epochs=20 \
  --seed=1234 \
  --output_dir="sd-pokemon-model-lora-sdxl" \
-  --validation_prompt="cute dragon creature"
-
+  --validation_prompt="cute dragon creature" 
+  
 ```


@@ -1,6 +1,5 @@
-#!/usr/bin/env python
 # coding=utf-8
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 HuggingFace Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,5 @@
-#!/usr/bin/env python
 # coding=utf-8
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 HuggingFace Inc.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,7 +12,6 @@
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
-# limitations under the License.

 import argparse
 import logging
@@ -396,7 +395,7 @@ def parse_args():
        "--prediction_type",
        type=str,
        default=None,
-        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
    )
    parser.add_argument(
        "--hub_model_id",
@@ -636,7 +635,7 @@ def main():
                ema_unet.to(accelerator.device)
                del load_model

-            for _ in range(len(models)):
+            for i in range(len(models)):
                # pop models so that they are not loaded again
                model = models.pop()

@@ -811,7 +810,7 @@ def main():
    if args.use_ema:
        ema_unet.to(accelerator.device)

-    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -1,19 +1,3 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
 import argparse
 import logging
 import math
@@ -1,4 +1,3 @@
-#!/usr/bin/env python
 # coding=utf-8
 # Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
@@ -294,7 +293,7 @@ def parse_args():
        "--prediction_type",
        type=str,
        default=None,
-        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
    )
    parser.add_argument(
        "--hub_model_id",
@@ -455,7 +454,7 @@ def main():
    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)

-    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -370,7 +370,7 @@ def parse_args(input_args=None):
        "--prediction_type",
        type=str,
        default=None,
-        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
    )
    parser.add_argument(
        "--hub_model_id",
@@ -585,7 +585,7 @@ def main(args):
    text_encoder_two.requires_grad_(False)
    unet.requires_grad_(False)

-    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -648,7 +648,7 @@ def main(args):
    def save_model_hook(models, weights, output_dir):
        if accelerator.is_main_process:
            # there are only two options here. Either are just the unet attn processor layers
-            # or there are the unet and text encoder attn layers
+            # or there are the unet and text encoder attn layers
            unet_lora_layers_to_save = None
            text_encoder_one_lora_layers_to_save = None
            text_encoder_two_lora_layers_to_save = None
@@ -419,7 +419,7 @@ def parse_args(input_args=None):
        "--prediction_type",
        type=str,
        default=None,
-        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediction_type` is chosen.",
+        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.",
    )
    parser.add_argument(
        "--hub_model_id",
@@ -683,7 +683,7 @@ def main(args):
    # Set unet as trainable.
    unet.train()

-    # For mixed precision training we cast all non-trainable weights to half-precision
+    # For mixed precision training we cast all non-trainable weights to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -738,7 +738,7 @@ def main(args):
                ema_unet.to(accelerator.device)
                del load_model

-            for _ in range(len(models)):
+            for i in range(len(models)):
                # pop models so that they are not loaded again
                model = models.pop()

@@ -962,7 +962,7 @@ def main(args):
    if accelerator.is_main_process:
        accelerator.init_trackers("text2image-fine-tune-sdxl", config=vars(args))

-    # Function for unwrapping if torch.compile() was used in accelerate.
+    # Function for unwrapping if torch.compile() was used in accelerate.
    def unwrap_model(model):
        model = accelerator.unwrap_model(model)
        model = model._orig_mod if is_compiled_module(model) else model
@@ -53,7 +53,6 @@ from diffusers import (
 )
 from diffusers.optimization import get_scheduler
 from diffusers.utils import check_min_version, is_wandb_available
-from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
 from diffusers.utils.import_utils import is_xformers_available


@@ -85,30 +84,32 @@ check_min_version("0.27.0.dev0")
 logger = get_logger(__name__)


-def save_model_card(repo_id: str, images: list = None, base_model: str = None, repo_folder: str = None):
+def save_model_card(repo_id: str, images=None, base_model=str, repo_folder=None):
    img_str = ""
-    if images is not None:
-        for i, image in enumerate(images):
-            image.save(os.path.join(repo_folder, f"image_{i}.png"))
-            img_str += f"![img_{i}](./image_{i}.png)\n"
-    model_description = f"""
+    for i, image in enumerate(images):
+        image.save(os.path.join(repo_folder, f"image_{i}.png"))
+        img_str += f"![img_{i}](./image_{i}.png)\n"
+
+    yaml = f"""
+---
+license: creativeml-openrail-m
+base_model: {base_model}
+tags:
+- stable-diffusion
+- stable-diffusion-diffusers
+- text-to-image
+- diffusers
+- textual_inversion
+inference: true
+---
+    """
+    model_card = f"""
 # Textual inversion text2image fine-tuning - {repo_id}
 These are textual inversion adaption weights for {base_model}. You can find some example images in the following. \n
 {img_str}
 """
-    model_card = load_or_create_model_card(
-        repo_id_or_path=repo_id,
-        from_training=True,
-        license="creativeml-openrail-m",
-        base_model=base_model,
-        model_description=model_description,
-        inference=True,
-    )
-
-    tags = ["stable-diffusion", "stable-diffusion-diffusers", "text-to-image", "diffusers", "textual_inversion"]
-    model_card = populate_model_card(model_card, tags=tags)
-
-    model_card.save(os.path.join(repo_folder, "README.md"))
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)


 def log_validation(text_encoder, tokenizer, unet, vae, args, accelerator, weight_dtype, epoch):
@@ -32,6 +32,8 @@ from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import ProjectConfiguration, set_seed
 from huggingface_hub import create_repo, upload_folder
+
+# TODO: remove and import from diffusers.utils when the new version of diffusers is released
 from packaging import version
 from PIL import Image
 from torch.utils.data import Dataset
@@ -49,7 +51,6 @@ from diffusers import (
 )
 from diffusers.optimization import get_scheduler
 from diffusers.utils import check_min_version, is_wandb_available
-from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
 from diffusers.utils.import_utils import is_xformers_available


@@ -87,31 +88,26 @@ def save_model_card(repo_id: str, images=None, base_model=str, repo_folder=None)
        image.save(os.path.join(repo_folder, f"image_{i}.png"))
        img_str += f"![img_{i}](./image_{i}.png)\n"

-    model_description = f"""
+    yaml = f"""
+---
+license: creativeml-openrail-m
+base_model: {base_model}
+tags:
+- stable-diffusion
+- stable-diffusion-diffusers
+- text-to-image
+- diffusers
+- textual_inversion
+inference: true
+---
+    """
+    model_card = f"""
 # Textual inversion text2image fine-tuning - {repo_id}
 These are textual inversion adaption weights for {base_model}. You can find some example images in the following. \n
 {img_str}
 """
-    model_card = load_or_create_model_card(
-        repo_id_or_path=repo_id,
-        from_training=True,
-        license="creativeml-openrail-m",
-        base_model=base_model,
-        model_description=model_description,
-        inference=True,
-    )
-
-    tags = [
-        "stable-diffusion-xl",
-        "stable-diffusion-xl-diffusers",
-        "text-to-image",
-        "diffusers",
-        "textual_inversion",
-    ]
-
-    model_card = populate_model_card(model_card, tags=tags)
-
-    model_card.save(os.path.join(repo_folder, "README.md"))
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)


 def log_validation(
@@ -48,4 +48,4 @@ if __name__ == "__main__":
    # skip loading position embeddings
    adapter.load_state_dict(conv_state_dict, strict=False)
    adapter.save_pretrained(args.output_path)
-    adapter.save_pretrained(args.output_path, variant="fp16", torch_dtype=torch.float16)
+    adapter.to(torch.float16).save_pretrained(args.output_path, variant="fp16")
@@ -4,7 +4,6 @@ import math
 import os
 from copy import deepcopy

-import requests
 import torch
 from audio_diffusion.models import DiffusionAttnUnet1D
 from diffusion import sampling
@@ -74,14 +73,9 @@ class DiffusionUncond(nn.Module):

 def download(model_name):
    url = MODELS_MAP[model_name]["url"]
-    r = requests.get(url, stream=True)
+    os.system(f"wget {url} ./")

-    local_filename = f"./{model_name}.ckpt"
-    with open(local_filename, "wb") as fp:
-        for chunk in r.iter_content(chunk_size=8192):
-            fp.write(chunk)
-
-    return local_filename
+    return f"./{model_name}.ckpt"


 DOWN_NUM_TO_LAYER = {
@@ -12,14 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import math
 import warnings
 from typing import List, Optional, Tuple, Union

 import numpy as np
 import PIL.Image
 import torch
-import torch.nn.functional as F
 from PIL import Image, ImageFilter, ImageOps

 from .configuration_utils import ConfigMixin, register_to_config
@@ -884,107 +882,3 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
            depth = self.binarize(depth)

        return rgb, depth
-
-
-class IPAdapterMaskProcessor(VaeImageProcessor):
-    """
-    Image processor for IP Adapter image masks.
-
-    Args:
-        do_resize (`bool`, *optional*, defaults to `True`):
-            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`.
-        vae_scale_factor (`int`, *optional*, defaults to `8`):
-            VAE scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of this factor.
-        resample (`str`, *optional*, defaults to `lanczos`):
-            Resampling filter to use when resizing the image.
-        do_normalize (`bool`, *optional*, defaults to `False`):
-            Whether to normalize the image to [-1,1].
-        do_binarize (`bool`, *optional*, defaults to `True`):
-            Whether to binarize the image to 0/1.
-        do_convert_grayscale (`bool`, *optional*, defaults to be `True`):
-            Whether to convert the images to grayscale format.
-
-    """
-
-    config_name = CONFIG_NAME
-
-    @register_to_config
-    def __init__(
-        self,
-        do_resize: bool = True,
-        vae_scale_factor: int = 8,
-        resample: str = "lanczos",
-        do_normalize: bool = False,
-        do_binarize: bool = True,
-        do_convert_grayscale: bool = True,
-    ):
-        super().__init__(
-            do_resize=do_resize,
-            vae_scale_factor=vae_scale_factor,
-            resample=resample,
-            do_normalize=do_normalize,
-            do_binarize=do_binarize,
-            do_convert_grayscale=do_convert_grayscale,
-        )
-
-    @staticmethod
-    def downsample(mask: torch.FloatTensor, batch_size: int, num_queries: int, value_embed_dim: int):
-        """
-        Downsamples the provided mask tensor to match the expected dimensions for scaled dot-product attention.
-        If the aspect ratio of the mask does not match the aspect ratio of the output image, a warning is issued.
-
-        Args:
-            mask (`torch.FloatTensor`):
-                The input mask tensor generated with `IPAdapterMaskProcessor.preprocess()`.
-            batch_size (`int`):
-                The batch size.
-            num_queries (`int`):
-                The number of queries.
-            value_embed_dim (`int`):
-                The dimensionality of the value embeddings.
-
-        Returns:
-            `torch.FloatTensor`:
-                The downsampled mask tensor.
-
-        """
-        o_h = mask.shape[1]
-        o_w = mask.shape[2]
-        ratio = o_w / o_h
-        mask_h = int(math.sqrt(num_queries / ratio))
-        mask_h = int(mask_h) + int((num_queries % int(mask_h)) != 0)
-        mask_w = num_queries // mask_h
-
-        mask_downsample = F.interpolate(mask.unsqueeze(0), size=(mask_h, mask_w), mode="bicubic").squeeze(0)
-
-        # Repeat batch_size times
-        if mask_downsample.shape[0] < batch_size:
-            mask_downsample = mask_downsample.repeat(batch_size, 1, 1)
-
-        mask_downsample = mask_downsample.view(mask_downsample.shape[0], -1)
-
-        downsampled_area = mask_h * mask_w
-        # If the output image and the mask do not have the same aspect ratio, tensor shapes will not match
-        # Pad tensor if downsampled_mask.shape[1] is smaller than num_queries
-        if downsampled_area < num_queries:
-            warnings.warn(
-                "The aspect ratio of the mask does not match the aspect ratio of the output image. "
-                "Please update your masks or adjust the output size for optimal performance.",
-                UserWarning,
-            )
-            mask_downsample = F.pad(mask_downsample, (0, num_queries - mask_downsample.shape[1]), value=0.0)
-        # Discard last embeddings if downsampled_mask.shape[1] is bigger than num_queries
-        if downsampled_area > num_queries:
-            warnings.warn(
-                "The aspect ratio of the mask does not match the aspect ratio of the output image. "
-                "Please update your masks or adjust the output size for optimal performance.",
-                UserWarning,
-            )
-            mask_downsample = mask_downsample[:, :num_queries]
-
-        # Repeat last dimension to match SDPA output shape
-        mask_downsample = mask_downsample.view(mask_downsample.shape[0], mask_downsample.shape[1], 1).repeat(
-            1, 1, value_embed_dim
-        )
-
-        return mask_downsample
@@ -75,6 +75,10 @@ class FromOriginalVAEMixin:
                diffusion model. When decoding, the latents are scaled back to the original scale with the formula: `z
                = 1 / scaling_factor * z`. For more details, refer to sections 4.3.2 and D.1 of the [High-Resolution
                Image Synthesis with Latent Diffusion Models](https://arxiv.org/abs/2112.10752) paper.
+            use_safetensors (`bool`, *optional*, defaults to `True`):
+                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
+                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
+                weights. If set to `False`, safetensors weights are not loaded.
            kwargs (remaining dictionary of keyword arguments, *optional*):
                Can be used to overwrite load and saveable variables (for example the pipeline components of the
                specific pipeline class). The overwritten components are directly passed to the pipelines `__init__`
@@ -107,6 +111,7 @@ class FromOriginalVAEMixin:
        local_files_only = kwargs.pop("local_files_only", None)
        revision = kwargs.pop("revision", None)
        torch_dtype = kwargs.pop("torch_dtype", None)
+        use_safetensors = kwargs.pop("use_safetensors", True)

        class_name = cls.__name__

@@ -126,18 +131,14 @@ class FromOriginalVAEMixin:
            token=token,
            revision=revision,
            local_files_only=local_files_only,
+            use_safetensors=use_safetensors,
            cache_dir=cache_dir,
        )

        image_size = kwargs.pop("image_size", None)
        scaling_factor = kwargs.pop("scaling_factor", None)
        component = create_diffusers_vae_model_from_ldm(
-            class_name,
-            original_config,
-            checkpoint,
-            image_size=image_size,
-            scaling_factor=scaling_factor,
-            torch_dtype=torch_dtype,
+            class_name, original_config, checkpoint, image_size=image_size, scaling_factor=scaling_factor
        )
        vae = component["vae"]
        if torch_dtype is not None:
@@ -65,6 +65,10 @@ class FromOriginalControlNetMixin:
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
                allowed by Git.
+            use_safetensors (`bool`, *optional*, defaults to `True`):
+                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
+                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
+                weights. If set to `False`, safetensors weights are not loaded.
            image_size (`int`, *optional*, defaults to 512):
                The image size the model was trained on. Use 512 for all Stable Diffusion v1 models and the Stable
                Diffusion v2 base model. Use 768 for Stable Diffusion v2.
@@ -97,6 +101,7 @@ class FromOriginalControlNetMixin:
        local_files_only = kwargs.pop("local_files_only", None)
        revision = kwargs.pop("revision", None)
        torch_dtype = kwargs.pop("torch_dtype", None)
+        use_safetensors = kwargs.pop("use_safetensors", True)

        class_name = cls.__name__
        if (config_file is not None) and (original_config_file is not None):
@@ -115,6 +120,7 @@ class FromOriginalControlNetMixin:
            token=token,
            revision=revision,
            local_files_only=local_files_only,
+            use_safetensors=use_safetensors,
            cache_dir=cache_dir,
        )

@@ -122,12 +128,7 @@ class FromOriginalControlNetMixin:
        image_size = kwargs.pop("image_size", None)

        component = create_diffusers_controlnet_model_from_ldm(
-            class_name,
-            original_config,
-            checkpoint,
-            upcast_attention=upcast_attention,
-            image_size=image_size,
-            torch_dtype=torch_dtype,
+            class_name, original_config, checkpoint, upcast_attention=upcast_attention, image_size=image_size
        )
        controlnet = component["controlnet"]
        if torch_dtype is not None:
@@ -57,19 +57,14 @@ def build_sub_model_components(
    if component_name == "unet":
        num_in_channels = kwargs.pop("num_in_channels", None)
        unet_components = create_diffusers_unet_model_from_ldm(
-            pipeline_class_name,
-            original_config,
-            checkpoint,
-            num_in_channels=num_in_channels,
-            image_size=image_size,
-            torch_dtype=torch_dtype,
+            pipeline_class_name, original_config, checkpoint, num_in_channels=num_in_channels, image_size=image_size
        )
        return unet_components

    if component_name == "vae":
        scaling_factor = kwargs.get("scaling_factor", None)
        vae_components = create_diffusers_vae_model_from_ldm(
-            pipeline_class_name, original_config, checkpoint, image_size, scaling_factor, torch_dtype
+            pipeline_class_name, original_config, checkpoint, image_size, scaling_factor
        )
        return vae_components

@@ -94,7 +89,6 @@ def build_sub_model_components(
            checkpoint,
            model_type=model_type,
            local_files_only=local_files_only,
-            torch_dtype=torch_dtype,
        )
        return text_encoder_components

@@ -181,6 +175,10 @@ class FromSingleFileMixin:
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, a commit id, or any identifier
                allowed by Git.
+            use_safetensors (`bool`, *optional*, defaults to `True`):
+                If set to `None`, the safetensors weights are downloaded if they're available **and** if the
+                safetensors library is installed. If set to `True`, the model is forcibly loaded from safetensors
+                weights. If set to `False`, safetensors weights are not loaded.
        Examples:

        ```py
@@ -212,6 +210,7 @@ class FromSingleFileMixin:
        local_files_only = kwargs.pop("local_files_only", False)
        revision = kwargs.pop("revision", None)
        torch_dtype = kwargs.pop("torch_dtype", None)
+        use_safetensors = kwargs.pop("use_safetensors", True)

        class_name = cls.__name__

@@ -225,6 +224,7 @@ class FromSingleFileMixin:
            token=token,
            revision=revision,
            local_files_only=local_files_only,
+            use_safetensors=use_safetensors,
            cache_dir=cache_dir,
        )

@@ -261,7 +261,6 @@ class FromSingleFileMixin:
                    image_size=image_size,
                    load_safety_checker=load_safety_checker,
                    local_files_only=local_files_only,
-                    torch_dtype=torch_dtype,
                    **kwargs,
                )
                if not components:
@@ -227,7 +227,14 @@ def fetch_ldm_config_and_checkpoint(
    cache_dir=None,
    local_files_only=None,
    revision=None,
+    use_safetensors=True,
 ):
+    file_extension = pretrained_model_link_or_path.rsplit(".", 1)[-1]
+    from_safetensors = file_extension == "safetensors"
+
+    if from_safetensors and use_safetensors is False:
+        raise ValueError("Make sure to install `safetensors` with `pip install safetensors`.")
+
    if os.path.isfile(pretrained_model_link_or_path):
        checkpoint = load_state_dict(pretrained_model_link_or_path)

@@ -849,7 +856,7 @@ def convert_controlnet_checkpoint(


 def create_diffusers_controlnet_model_from_ldm(
-    pipeline_class_name, original_config, checkpoint, upcast_attention=False, image_size=None, torch_dtype=None
+    pipeline_class_name, original_config, checkpoint, upcast_attention=False, image_size=None
 ):
    # import here to avoid circular imports
    from ..models import ControlNetModel
@@ -868,9 +875,7 @@ def create_diffusers_controlnet_model_from_ldm(
    if is_accelerate_available():
        from ..models.modeling_utils import load_model_dict_into_meta

-        unexpected_keys = load_model_dict_into_meta(
-            controlnet, diffusers_format_controlnet_checkpoint, dtype=torch_dtype
-        )
+        unexpected_keys = load_model_dict_into_meta(controlnet, diffusers_format_controlnet_checkpoint)
        if controlnet._keys_to_ignore_on_load_unexpected is not None:
            for pat in controlnet._keys_to_ignore_on_load_unexpected:
                unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
@@ -882,9 +887,6 @@ def create_diffusers_controlnet_model_from_ldm(
    else:
        controlnet.load_state_dict(diffusers_format_controlnet_checkpoint)

-    if torch_dtype is not None:
-        controlnet = controlnet.to(torch_dtype)
-
    return {"controlnet": controlnet}


@@ -1020,7 +1022,7 @@ def convert_ldm_vae_checkpoint(checkpoint, config):
    return new_checkpoint


-def create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_files_only=False, torch_dtype=None):
+def create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_files_only=False):
    try:
        config = CLIPTextConfig.from_pretrained(config_name, local_files_only=local_files_only)
    except Exception:
@@ -1046,7 +1048,7 @@ def create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_
    if is_accelerate_available():
        from ..models.modeling_utils import load_model_dict_into_meta

-        unexpected_keys = load_model_dict_into_meta(text_model, text_model_dict, dtype=torch_dtype)
+        unexpected_keys = load_model_dict_into_meta(text_model, text_model_dict)
        if text_model._keys_to_ignore_on_load_unexpected is not None:
            for pat in text_model._keys_to_ignore_on_load_unexpected:
                unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
@@ -1061,9 +1063,6 @@ def create_text_encoder_from_ldm_clip_checkpoint(config_name, checkpoint, local_

        text_model.load_state_dict(text_model_dict)

-    if torch_dtype is not None:
-        text_model = text_model.to(torch_dtype)
-
    return text_model


@@ -1073,7 +1072,6 @@ def create_text_encoder_from_open_clip_checkpoint(
    prefix="cond_stage_model.model.",
    has_projection=False,
    local_files_only=False,
-    torch_dtype=None,
    **config_kwargs,
 ):
    try:
@@ -1141,7 +1139,7 @@ def create_text_encoder_from_open_clip_checkpoint(
    if is_accelerate_available():
        from ..models.modeling_utils import load_model_dict_into_meta

-        unexpected_keys = load_model_dict_into_meta(text_model, text_model_dict, dtype=torch_dtype)
+        unexpected_keys = load_model_dict_into_meta(text_model, text_model_dict)
        if text_model._keys_to_ignore_on_load_unexpected is not None:
            for pat in text_model._keys_to_ignore_on_load_unexpected:
                unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
@@ -1157,9 +1155,6 @@ def create_text_encoder_from_open_clip_checkpoint(

        text_model.load_state_dict(text_model_dict)

-    if torch_dtype is not None:
-        text_model = text_model.to(torch_dtype)
-
    return text_model


@@ -1171,7 +1166,6 @@ def create_diffusers_unet_model_from_ldm(
    upcast_attention=False,
    extract_ema=False,
    image_size=None,
-    torch_dtype=None,
 ):
    from ..models import UNet2DConditionModel

@@ -1204,7 +1198,7 @@ def create_diffusers_unet_model_from_ldm(
    if is_accelerate_available():
        from ..models.modeling_utils import load_model_dict_into_meta

-        unexpected_keys = load_model_dict_into_meta(unet, diffusers_format_unet_checkpoint, dtype=torch_dtype)
+        unexpected_keys = load_model_dict_into_meta(unet, diffusers_format_unet_checkpoint)
        if unet._keys_to_ignore_on_load_unexpected is not None:
            for pat in unet._keys_to_ignore_on_load_unexpected:
                unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
@@ -1216,14 +1210,11 @@ def create_diffusers_unet_model_from_ldm(
    else:
        unet.load_state_dict(diffusers_format_unet_checkpoint)

-    if torch_dtype is not None:
-        unet = unet.to(torch_dtype)
-
    return {"unet": unet}


 def create_diffusers_vae_model_from_ldm(
-    pipeline_class_name, original_config, checkpoint, image_size=None, scaling_factor=None, torch_dtype=None
+    pipeline_class_name, original_config, checkpoint, image_size=None, scaling_factor=None
 ):
    # import here to avoid circular imports
    from ..models import AutoencoderKL
@@ -1240,7 +1231,7 @@ def create_diffusers_vae_model_from_ldm(
    if is_accelerate_available():
        from ..models.modeling_utils import load_model_dict_into_meta

-        unexpected_keys = load_model_dict_into_meta(vae, diffusers_format_vae_checkpoint, dtype=torch_dtype)
+        unexpected_keys = load_model_dict_into_meta(vae, diffusers_format_vae_checkpoint)
        if vae._keys_to_ignore_on_load_unexpected is not None:
            for pat in vae._keys_to_ignore_on_load_unexpected:
                unexpected_keys = [k for k in unexpected_keys if re.search(pat, k) is None]
@@ -1252,9 +1243,6 @@ def create_diffusers_vae_model_from_ldm(
    else:
        vae.load_state_dict(diffusers_format_vae_checkpoint)

-    if torch_dtype is not None:
-        vae = vae.to(torch_dtype)
-
    return {"vae": vae}


@@ -1263,7 +1251,6 @@ def create_text_encoders_and_tokenizers_from_ldm(
    checkpoint,
    model_type=None,
    local_files_only=False,
-    torch_dtype=None,
 ):
    model_type = infer_model_type(original_config, model_type=model_type)

@@ -1273,7 +1260,7 @@ def create_text_encoders_and_tokenizers_from_ldm(

        try:
            text_encoder = create_text_encoder_from_open_clip_checkpoint(
-                config_name, checkpoint, local_files_only=local_files_only, torch_dtype=torch_dtype, **config_kwargs
+                config_name, checkpoint, local_files_only=local_files_only, **config_kwargs
            )
            tokenizer = CLIPTokenizer.from_pretrained(
                config_name, subfolder="tokenizer", local_files_only=local_files_only
@@ -1292,7 +1279,6 @@ def create_text_encoders_and_tokenizers_from_ldm(
                config_name,
                checkpoint,
                local_files_only=local_files_only,
-                torch_dtype=torch_dtype,
            )
            tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)

@@ -1316,7 +1302,6 @@ def create_text_encoders_and_tokenizers_from_ldm(
                prefix=prefix,
                has_projection=True,
                local_files_only=local_files_only,
-                torch_dtype=torch_dtype,
                **config_kwargs,
            )
        except Exception:
@@ -1337,7 +1322,7 @@ def create_text_encoders_and_tokenizers_from_ldm(
            config_name = "openai/clip-vit-large-patch14"
            tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
            text_encoder = create_text_encoder_from_ldm_clip_checkpoint(
-                config_name, checkpoint, local_files_only=local_files_only, torch_dtype=torch_dtype
+                config_name, checkpoint, local_files_only=local_files_only
            )

        except Exception:
@@ -1356,7 +1341,6 @@ def create_text_encoders_and_tokenizers_from_ldm(
                prefix=prefix,
                has_projection=True,
                local_files_only=local_files_only,
-                torch_dtype=torch_dtype,
                **config_kwargs,
            )
        except Exception:
@@ -457,8 +457,6 @@ class TextualInversionLoaderMixin:
    def unload_textual_inversion(
        self,
        tokens: Optional[Union[str, List[str]]] = None,
-        tokenizer: Optional["PreTrainedTokenizer"] = None,
-        text_encoder: Optional["PreTrainedModel"] = None,
    ):
        r"""
        Unload Textual Inversion embeddings from the text encoder of [`StableDiffusionPipeline`]
@@ -483,28 +481,11 @@ class TextualInversionLoaderMixin:

        # Remove just one token
        pipeline.unload_textual_inversion("<moe-bius>")
-
-        # Example 3: unload from SDXL
-        pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0")
-        embedding_path = hf_hub_download(repo_id="linoyts/web_y2k", filename="web_y2k_emb.safetensors", repo_type="model")
-
-        # load embeddings to the text encoders
-        state_dict = load_file(embedding_path)
-
-        # load embeddings of text_encoder 1 (CLIP ViT-L/14)
-        pipeline.load_textual_inversion(state_dict["clip_l"], token=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer)
-        # load embeddings of text_encoder 2 (CLIP ViT-G/14)
-        pipeline.load_textual_inversion(state_dict["clip_g"], token=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2)
-
-        # Unload explicitly from both text encoders abd tokenizers
-        pipeline.unload_textual_inversion(tokens=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder, tokenizer=pipeline.tokenizer)
-        pipeline.unload_textual_inversion(tokens=["<s0>", "<s1>"], text_encoder=pipeline.text_encoder_2, tokenizer=pipeline.tokenizer_2)
-
        ```
        """

-        tokenizer = tokenizer or getattr(self, "tokenizer", None)
-        text_encoder = text_encoder or getattr(self, "text_encoder", None)
+        tokenizer = getattr(self, "tokenizer", None)
+        text_encoder = getattr(self, "text_encoder", None)

        # Get textual inversion tokens and ids
        token_ids = []
@@ -19,7 +19,6 @@ import torch
 import torch.nn.functional as F
 from torch import nn

-from ..image_processor import IPAdapterMaskProcessor
 from ..utils import USE_PEFT_BACKEND, deprecate, logging
 from ..utils.import_utils import is_xformers_available
 from ..utils.torch_utils import maybe_allow_in_graph
@@ -1810,7 +1809,24 @@ class SpatialNorm(nn.Module):
        return new_f


+## Deprecated
 class LoRAAttnProcessor(nn.Module):
+    r"""
+    Processor for implementing the LoRA attention mechanism.
+
+    Args:
+        hidden_size (`int`, *optional*):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`, *optional*):
+            The number of channels in the `encoder_hidden_states`.
+        rank (`int`, defaults to 4):
+            The dimension of the LoRA update matrices.
+        network_alpha (`int`, *optional*):
+            Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
+        kwargs (`dict`):
+            Additional keyword arguments to pass to the `LoRALinearLayer` layers.
+    """
+
    def __init__(
        self,
        hidden_size: int,
@@ -1819,9 +1835,6 @@ class LoRAAttnProcessor(nn.Module):
        network_alpha: Optional[int] = None,
        **kwargs,
    ):
-        deprecation_message = "Using LoRAAttnProcessor is deprecated. Please use the PEFT backend for all things LoRA. You can install PEFT by running `pip install peft`."
-        deprecate("LoRAAttnProcessor", "0.30.0", deprecation_message, standard_warn=False)
-
        super().__init__()

        self.hidden_size = hidden_size
@@ -1870,6 +1883,23 @@ class LoRAAttnProcessor(nn.Module):


 class LoRAAttnProcessor2_0(nn.Module):
+    r"""
+    Processor for implementing the LoRA attention mechanism using PyTorch 2.0's memory-efficient scaled dot-product
+    attention.
+
+    Args:
+        hidden_size (`int`):
+            The hidden size of the attention layer.
+        cross_attention_dim (`int`, *optional*):
+            The number of channels in the `encoder_hidden_states`.
+        rank (`int`, defaults to 4):
+            The dimension of the LoRA update matrices.
+        network_alpha (`int`, *optional*):
+            Equivalent to `alpha` but its usage is specific to Kohya (A1111) style LoRAs.
+        kwargs (`dict`):
+            Additional keyword arguments to pass to the `LoRALinearLayer` layers.
+    """
+
    def __init__(
        self,
        hidden_size: int,
@@ -1878,9 +1908,6 @@ class LoRAAttnProcessor2_0(nn.Module):
        network_alpha: Optional[int] = None,
        **kwargs,
    ):
-        deprecation_message = "Using LoRAAttnProcessor is deprecated. Please use the PEFT backend for all things LoRA. You can install PEFT by running `pip install peft`."
-        deprecate("LoRAAttnProcessor2_0", "0.30.0", deprecation_message, standard_warn=False)
-
        super().__init__()
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError("AttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0.")
@@ -2108,13 +2135,12 @@ class IPAdapterAttnProcessor(nn.Module):

    def __call__(
        self,
-        attn: Attention,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        temb: Optional[torch.FloatTensor] = None,
-        scale: float = 1.0,
-        ip_adapter_masks: Optional[torch.FloatTensor] = None,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        scale=1.0,
    ):
        residual = hidden_states

@@ -2169,22 +2195,9 @@ class IPAdapterAttnProcessor(nn.Module):
        hidden_states = torch.bmm(attention_probs, value)
        hidden_states = attn.batch_to_head_dim(hidden_states)

-        if ip_adapter_masks is not None:
-            if not isinstance(ip_adapter_masks, torch.Tensor) or ip_adapter_masks.ndim != 4:
-                raise ValueError(
-                    " ip_adapter_mask should be a tensor with shape [num_ip_adapter, 1, height, width]."
-                    " Please use `IPAdapterMaskProcessor` to preprocess your mask"
-                )
-            if len(ip_adapter_masks) != len(self.scale):
-                raise ValueError(
-                    f"Number of ip_adapter_masks ({len(ip_adapter_masks)}) must match number of IP-Adapters ({len(self.scale)})"
-                )
-        else:
-            ip_adapter_masks = [None] * len(self.scale)
-
        # for ip-adapter
-        for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip(
-            ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks
+        for current_ip_hidden_states, scale, to_k_ip, to_v_ip in zip(
+            ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip
        ):
            ip_key = to_k_ip(current_ip_hidden_states)
            ip_value = to_v_ip(current_ip_hidden_states)
@@ -2196,15 +2209,6 @@ class IPAdapterAttnProcessor(nn.Module):
            current_ip_hidden_states = torch.bmm(ip_attention_probs, ip_value)
            current_ip_hidden_states = attn.batch_to_head_dim(current_ip_hidden_states)

-            if mask is not None:
-                mask_downsample = IPAdapterMaskProcessor.downsample(
-                    mask, batch_size, current_ip_hidden_states.shape[1], current_ip_hidden_states.shape[2]
-                )
-
-                mask_downsample = mask_downsample.to(dtype=query.dtype, device=query.device)
-
-                current_ip_hidden_states = current_ip_hidden_states * mask_downsample
-
            hidden_states = hidden_states + scale * current_ip_hidden_states

        # linear proj
@@ -2268,13 +2272,12 @@ class IPAdapterAttnProcessor2_0(torch.nn.Module):

    def __call__(
        self,
-        attn: Attention,
-        hidden_states: torch.FloatTensor,
-        encoder_hidden_states: Optional[torch.FloatTensor] = None,
-        attention_mask: Optional[torch.FloatTensor] = None,
-        temb: Optional[torch.FloatTensor] = None,
-        scale: float = 1.0,
-        ip_adapter_masks: Optional[torch.FloatTensor] = None,
+        attn,
+        hidden_states,
+        encoder_hidden_states=None,
+        attention_mask=None,
+        temb=None,
+        scale=1.0,
    ):
        residual = hidden_states

@@ -2343,22 +2346,9 @@ class IPAdapterAttnProcessor2_0(torch.nn.Module):
        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

-        if ip_adapter_masks is not None:
-            if not isinstance(ip_adapter_masks, torch.Tensor) or ip_adapter_masks.ndim != 4:
-                raise ValueError(
-                    " ip_adapter_mask should be a tensor with shape [num_ip_adapter, 1, height, width]."
-                    " Please use `IPAdapterMaskProcessor` to preprocess your mask"
-                )
-            if len(ip_adapter_masks) != len(self.scale):
-                raise ValueError(
-                    f"Number of ip_adapter_masks ({len(ip_adapter_masks)}) must match number of IP-Adapters ({len(self.scale)})"
-                )
-        else:
-            ip_adapter_masks = [None] * len(self.scale)
-
        # for ip-adapter
-        for current_ip_hidden_states, scale, to_k_ip, to_v_ip, mask in zip(
-            ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip, ip_adapter_masks
+        for current_ip_hidden_states, scale, to_k_ip, to_v_ip in zip(
+            ip_hidden_states, self.scale, self.to_k_ip, self.to_v_ip
        ):
            ip_key = to_k_ip(current_ip_hidden_states)
            ip_value = to_v_ip(current_ip_hidden_states)
@@ -2377,15 +2367,6 @@ class IPAdapterAttnProcessor2_0(torch.nn.Module):
            )
            current_ip_hidden_states = current_ip_hidden_states.to(query.dtype)

-            if mask is not None:
-                mask_downsample = IPAdapterMaskProcessor.downsample(
-                    mask, batch_size, current_ip_hidden_states.shape[1], current_ip_hidden_states.shape[2]
-                )
-
-                mask_downsample = mask_downsample.to(dtype=query.dtype, device=query.device)
-
-                current_ip_hidden_states = current_ip_hidden_states * mask_downsample
-
            hidden_states = hidden_states + scale * current_ip_hidden_states

        # linear proj
@@ -249,81 +249,6 @@ def get_down_block(
    raise ValueError(f"{down_block_type} does not exist.")


-def get_mid_block(
-    mid_block_type: str,
-    temb_channels: int,
-    in_channels: int,
-    resnet_eps: float,
-    resnet_act_fn: str,
-    resnet_groups: int,
-    output_scale_factor: float = 1.0,
-    transformer_layers_per_block: int = 1,
-    num_attention_heads: Optional[int] = None,
-    cross_attention_dim: Optional[int] = None,
-    dual_cross_attention: bool = False,
-    use_linear_projection: bool = False,
-    mid_block_only_cross_attention: bool = False,
-    upcast_attention: bool = False,
-    resnet_time_scale_shift: str = "default",
-    attention_type: str = "default",
-    resnet_skip_time_act: bool = False,
-    cross_attention_norm: Optional[str] = None,
-    attention_head_dim: Optional[int] = 1,
-    dropout: float = 0.0,
-):
-    if mid_block_type == "UNetMidBlock2DCrossAttn":
-        return UNetMidBlock2DCrossAttn(
-            transformer_layers_per_block=transformer_layers_per_block,
-            in_channels=in_channels,
-            temb_channels=temb_channels,
-            dropout=dropout,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            output_scale_factor=output_scale_factor,
-            resnet_time_scale_shift=resnet_time_scale_shift,
-            cross_attention_dim=cross_attention_dim,
-            num_attention_heads=num_attention_heads,
-            resnet_groups=resnet_groups,
-            dual_cross_attention=dual_cross_attention,
-            use_linear_projection=use_linear_projection,
-            upcast_attention=upcast_attention,
-            attention_type=attention_type,
-        )
-    elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
-        return UNetMidBlock2DSimpleCrossAttn(
-            in_channels=in_channels,
-            temb_channels=temb_channels,
-            dropout=dropout,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            output_scale_factor=output_scale_factor,
-            cross_attention_dim=cross_attention_dim,
-            attention_head_dim=attention_head_dim,
-            resnet_groups=resnet_groups,
-            resnet_time_scale_shift=resnet_time_scale_shift,
-            skip_time_act=resnet_skip_time_act,
-            only_cross_attention=mid_block_only_cross_attention,
-            cross_attention_norm=cross_attention_norm,
-        )
-    elif mid_block_type == "UNetMidBlock2D":
-        return UNetMidBlock2D(
-            in_channels=in_channels,
-            temb_channels=temb_channels,
-            dropout=dropout,
-            num_layers=0,
-            resnet_eps=resnet_eps,
-            resnet_act_fn=resnet_act_fn,
-            output_scale_factor=output_scale_factor,
-            resnet_groups=resnet_groups,
-            resnet_time_scale_shift=resnet_time_scale_shift,
-            add_attention=False,
-        )
-    elif mid_block_type is None:
-        return None
-    else:
-        raise ValueError(f"unknown mid_block_type : {mid_block_type}")
-
-
 def get_up_block(
    up_block_type: str,
    num_layers: int,
@@ -44,8 +44,10 @@ from ..embeddings import (
 )
 from ..modeling_utils import ModelMixin
 from .unet_2d_blocks import (
+    UNetMidBlock2D,
+    UNetMidBlock2DCrossAttn,
+    UNetMidBlock2DSimpleCrossAttn,
    get_down_block,
-    get_mid_block,
    get_up_block,
 )

@@ -237,18 +239,44 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
        num_attention_heads = num_attention_heads or attention_head_dim

        # Check inputs
-        self._check_config(
-            down_block_types=down_block_types,
-            up_block_types=up_block_types,
-            only_cross_attention=only_cross_attention,
-            block_out_channels=block_out_channels,
-            layers_per_block=layers_per_block,
-            cross_attention_dim=cross_attention_dim,
-            transformer_layers_per_block=transformer_layers_per_block,
-            reverse_transformer_layers_per_block=reverse_transformer_layers_per_block,
-            attention_head_dim=attention_head_dim,
-            num_attention_heads=num_attention_heads,
-        )
+        if len(down_block_types) != len(up_block_types):
+            raise ValueError(
+                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
+            )
+
+        if len(block_out_channels) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+            raise ValueError(
+                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
+            )
+        if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
+            for layer_number_per_block in transformer_layers_per_block:
+                if isinstance(layer_number_per_block, list):
+                    raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")

        # input
        conv_in_padding = (conv_in_kernel - 1) // 2
@@ -257,13 +285,23 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
        )

        # time
-        time_embed_dim, timestep_input_dim = self._set_time_proj(
-            time_embedding_type,
-            block_out_channels=block_out_channels,
-            flip_sin_to_cos=flip_sin_to_cos,
-            freq_shift=freq_shift,
-            time_embedding_dim=time_embedding_dim,
-        )
+        if time_embedding_type == "fourier":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
+            if time_embed_dim % 2 != 0:
+                raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
+            self.time_proj = GaussianFourierProjection(
+                time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
+            )
+            timestep_input_dim = time_embed_dim
+        elif time_embedding_type == "positional":
+            time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
+
+            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+            timestep_input_dim = block_out_channels[0]
+        else:
+            raise ValueError(
+                f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
+            )

        self.time_embedding = TimestepEmbedding(
            timestep_input_dim,
@@ -273,33 +311,96 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
            cond_proj_dim=time_cond_proj_dim,
        )

-        self._set_encoder_hid_proj(
-            encoder_hid_dim_type,
-            cross_attention_dim=cross_attention_dim,
-            encoder_hid_dim=encoder_hid_dim,
-        )
+        if encoder_hid_dim_type is None and encoder_hid_dim is not None:
+            encoder_hid_dim_type = "text_proj"
+            self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
+            logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
+
+        if encoder_hid_dim is None and encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
+            )
+
+        if encoder_hid_dim_type == "text_proj":
+            self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
+        elif encoder_hid_dim_type == "text_image_proj":
+            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # it is set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `encoder_hid_dim_type == "text_image_proj"` (Kandinsky 2.1)
+            self.encoder_hid_proj = TextImageProjection(
+                text_embed_dim=encoder_hid_dim,
+                image_embed_dim=cross_attention_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2
+            self.encoder_hid_proj = ImageProjection(
+                image_embed_dim=encoder_hid_dim,
+                cross_attention_dim=cross_attention_dim,
+            )
+        elif encoder_hid_dim_type is not None:
+            raise ValueError(
+                f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
+            )
+        else:
+            self.encoder_hid_proj = None

        # class embedding
-        self._set_class_embedding(
-            class_embed_type,
-            act_fn=act_fn,
-            num_class_embeds=num_class_embeds,
-            projection_class_embeddings_input_dim=projection_class_embeddings_input_dim,
-            time_embed_dim=time_embed_dim,
-            timestep_input_dim=timestep_input_dim,
-        )
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        elif class_embed_type == "projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
+            # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
+            # 2. it projects from an arbitrary input dimension.
+            #
+            # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
+            # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
+            # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
+            self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif class_embed_type == "simple_projection":
+            if projection_class_embeddings_input_dim is None:
+                raise ValueError(
+                    "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
+                )
+            self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
+        else:
+            self.class_embedding = None

-        self._set_add_embedding(
-            addition_embed_type,
-            addition_embed_type_num_heads=addition_embed_type_num_heads,
-            addition_time_embed_dim=addition_time_embed_dim,
-            cross_attention_dim=cross_attention_dim,
-            encoder_hid_dim=encoder_hid_dim,
-            flip_sin_to_cos=flip_sin_to_cos,
-            freq_shift=freq_shift,
-            projection_class_embeddings_input_dim=projection_class_embeddings_input_dim,
-            time_embed_dim=time_embed_dim,
-        )
+        if addition_embed_type == "text":
+            if encoder_hid_dim is not None:
+                text_time_embedding_from_dim = encoder_hid_dim
+            else:
+                text_time_embedding_from_dim = cross_attention_dim
+
+            self.add_embedding = TextTimeEmbedding(
+                text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
+            )
+        elif addition_embed_type == "text_image":
+            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
+            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
+            # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)
+            self.add_embedding = TextImageTimeEmbedding(
+                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
+            )
+        elif addition_embed_type == "text_time":
+            self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
+            self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        elif addition_embed_type == "image":
+            # Kandinsky 2.2
+            self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
+        elif addition_embed_type == "image_hint":
+            # Kandinsky 2.2 ControlNet
+            self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
+        elif addition_embed_type is not None:
+            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")

        if time_embedding_act_fn is None:
            self.time_embed_act = None
@@ -377,28 +478,57 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
            self.down_blocks.append(down_block)

        # mid
-        self.mid_block = get_mid_block(
-            mid_block_type,
-            temb_channels=blocks_time_embed_dim,
-            in_channels=block_out_channels[-1],
-            resnet_eps=norm_eps,
-            resnet_act_fn=act_fn,
-            resnet_groups=norm_num_groups,
-            output_scale_factor=mid_block_scale_factor,
-            transformer_layers_per_block=transformer_layers_per_block[-1],
-            num_attention_heads=num_attention_heads[-1],
-            cross_attention_dim=cross_attention_dim[-1],
-            dual_cross_attention=dual_cross_attention,
-            use_linear_projection=use_linear_projection,
-            mid_block_only_cross_attention=mid_block_only_cross_attention,
-            upcast_attention=upcast_attention,
-            resnet_time_scale_shift=resnet_time_scale_shift,
-            attention_type=attention_type,
-            resnet_skip_time_act=resnet_skip_time_act,
-            cross_attention_norm=cross_attention_norm,
-            attention_head_dim=attention_head_dim[-1],
-            dropout=dropout,
-        )
+        if mid_block_type == "UNetMidBlock2DCrossAttn":
+            self.mid_block = UNetMidBlock2DCrossAttn(
+                transformer_layers_per_block=transformer_layers_per_block[-1],
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                cross_attention_dim=cross_attention_dim[-1],
+                num_attention_heads=num_attention_heads[-1],
+                resnet_groups=norm_num_groups,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                upcast_attention=upcast_attention,
+                attention_type=attention_type,
+            )
+        elif mid_block_type == "UNetMidBlock2DSimpleCrossAttn":
+            self.mid_block = UNetMidBlock2DSimpleCrossAttn(
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                cross_attention_dim=cross_attention_dim[-1],
+                attention_head_dim=attention_head_dim[-1],
+                resnet_groups=norm_num_groups,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                skip_time_act=resnet_skip_time_act,
+                only_cross_attention=mid_block_only_cross_attention,
+                cross_attention_norm=cross_attention_norm,
+            )
+        elif mid_block_type == "UNetMidBlock2D":
+            self.mid_block = UNetMidBlock2D(
+                in_channels=block_out_channels[-1],
+                temb_channels=blocks_time_embed_dim,
+                dropout=dropout,
+                num_layers=0,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_groups=norm_num_groups,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                add_attention=False,
+            )
+        elif mid_block_type is None:
+            self.mid_block = None
+        else:
+            raise ValueError(f"unknown mid_block_type : {mid_block_type}")

        # count how many layers upsample the images
        self.num_upsamplers = 0
@@ -477,206 +607,6 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
            block_out_channels[0], out_channels, kernel_size=conv_out_kernel, padding=conv_out_padding
        )

-        self._set_pos_net_if_use_gligen(attention_type=attention_type, cross_attention_dim=cross_attention_dim)
-
-    def _check_config(
-        self,
-        down_block_types: Tuple[str],
-        up_block_types: Tuple[str],
-        only_cross_attention: Union[bool, Tuple[bool]],
-        block_out_channels: Tuple[int],
-        layers_per_block: [int, Tuple[int]],
-        cross_attention_dim: Union[int, Tuple[int]],
-        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]],
-        reverse_transformer_layers_per_block: bool,
-        attention_head_dim: int,
-        num_attention_heads: Optional[Union[int, Tuple[int]]],
-    ):
-        if len(down_block_types) != len(up_block_types):
-            raise ValueError(
-                f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}."
-            )
-
-        if len(block_out_channels) != len(down_block_types):
-            raise ValueError(
-                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
-            )
-
-        if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types):
-            raise ValueError(
-                f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}."
-            )
-
-        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
-            raise ValueError(
-                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
-            )
-
-        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(down_block_types):
-            raise ValueError(
-                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
-            )
-
-        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
-            raise ValueError(
-                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
-            )
-
-        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
-            raise ValueError(
-                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
-            )
-        if isinstance(transformer_layers_per_block, list) and reverse_transformer_layers_per_block is None:
-            for layer_number_per_block in transformer_layers_per_block:
-                if isinstance(layer_number_per_block, list):
-                    raise ValueError("Must provide 'reverse_transformer_layers_per_block` if using asymmetrical UNet.")
-
-    def _set_time_proj(
-        self,
-        time_embedding_type: str,
-        block_out_channels: int,
-        flip_sin_to_cos: bool,
-        freq_shift: float,
-        time_embedding_dim: int,
-    ) -> Tuple[int, int]:
-        if time_embedding_type == "fourier":
-            time_embed_dim = time_embedding_dim or block_out_channels[0] * 2
-            if time_embed_dim % 2 != 0:
-                raise ValueError(f"`time_embed_dim` should be divisible by 2, but is {time_embed_dim}.")
-            self.time_proj = GaussianFourierProjection(
-                time_embed_dim // 2, set_W_to_weight=False, log=False, flip_sin_to_cos=flip_sin_to_cos
-            )
-            timestep_input_dim = time_embed_dim
-        elif time_embedding_type == "positional":
-            time_embed_dim = time_embedding_dim or block_out_channels[0] * 4
-
-            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
-            timestep_input_dim = block_out_channels[0]
-        else:
-            raise ValueError(
-                f"{time_embedding_type} does not exist. Please make sure to use one of `fourier` or `positional`."
-            )
-
-        return time_embed_dim, timestep_input_dim
-
-    def _set_encoder_hid_proj(
-        self,
-        encoder_hid_dim_type: Optional[str],
-        cross_attention_dim: Union[int, Tuple[int]],
-        encoder_hid_dim: Optional[int],
-    ):
-        if encoder_hid_dim_type is None and encoder_hid_dim is not None:
-            encoder_hid_dim_type = "text_proj"
-            self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type)
-            logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.")
-
-        if encoder_hid_dim is None and encoder_hid_dim_type is not None:
-            raise ValueError(
-                f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}."
-            )
-
-        if encoder_hid_dim_type == "text_proj":
-            self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim)
-        elif encoder_hid_dim_type == "text_image_proj":
-            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
-            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
-            # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
-            self.encoder_hid_proj = TextImageProjection(
-                text_embed_dim=encoder_hid_dim,
-                image_embed_dim=cross_attention_dim,
-                cross_attention_dim=cross_attention_dim,
-            )
-        elif encoder_hid_dim_type == "image_proj":
-            # Kandinsky 2.2
-            self.encoder_hid_proj = ImageProjection(
-                image_embed_dim=encoder_hid_dim,
-                cross_attention_dim=cross_attention_dim,
-            )
-        elif encoder_hid_dim_type is not None:
-            raise ValueError(
-                f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'."
-            )
-        else:
-            self.encoder_hid_proj = None
-
-    def _set_class_embedding(
-        self,
-        class_embed_type: Optional[str],
-        act_fn: str,
-        num_class_embeds: Optional[int],
-        projection_class_embeddings_input_dim: Optional[int],
-        time_embed_dim: int,
-        timestep_input_dim: int,
-    ):
-        if class_embed_type is None and num_class_embeds is not None:
-            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
-        elif class_embed_type == "timestep":
-            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim, act_fn=act_fn)
-        elif class_embed_type == "identity":
-            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
-        elif class_embed_type == "projection":
-            if projection_class_embeddings_input_dim is None:
-                raise ValueError(
-                    "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set"
-                )
-            # The projection `class_embed_type` is the same as the timestep `class_embed_type` except
-            # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings
-            # 2. it projects from an arbitrary input dimension.
-            #
-            # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations.
-            # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings.
-            # As a result, `TimestepEmbedding` can be passed arbitrary vectors.
-            self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
-        elif class_embed_type == "simple_projection":
-            if projection_class_embeddings_input_dim is None:
-                raise ValueError(
-                    "`class_embed_type`: 'simple_projection' requires `projection_class_embeddings_input_dim` be set"
-                )
-            self.class_embedding = nn.Linear(projection_class_embeddings_input_dim, time_embed_dim)
-        else:
-            self.class_embedding = None
-
-    def _set_add_embedding(
-        self,
-        addition_embed_type: str,
-        addition_embed_type_num_heads: int,
-        addition_time_embed_dim: Optional[int],
-        flip_sin_to_cos: bool,
-        freq_shift: float,
-        cross_attention_dim: Optional[int],
-        encoder_hid_dim: Optional[int],
-        projection_class_embeddings_input_dim: Optional[int],
-        time_embed_dim: int,
-    ):
-        if addition_embed_type == "text":
-            if encoder_hid_dim is not None:
-                text_time_embedding_from_dim = encoder_hid_dim
-            else:
-                text_time_embedding_from_dim = cross_attention_dim
-
-            self.add_embedding = TextTimeEmbedding(
-                text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads
-            )
-        elif addition_embed_type == "text_image":
-            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
-            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
-            # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
-            self.add_embedding = TextImageTimeEmbedding(
-                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
-            )
-        elif addition_embed_type == "text_time":
-            self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift)
-            self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
-        elif addition_embed_type == "image":
-            # Kandinsky 2.2
-            self.add_embedding = ImageTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
-        elif addition_embed_type == "image_hint":
-            # Kandinsky 2.2 ControlNet
-            self.add_embedding = ImageHintTimeEmbedding(image_embed_dim=encoder_hid_dim, time_embed_dim=time_embed_dim)
-        elif addition_embed_type is not None:
-            raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.")
-
-    def _set_pos_net_if_use_gligen(self, attention_type: str, cross_attention_dim: int):
        if attention_type in ["gated", "gated-text-image"]:
            positive_len = 768
            if isinstance(cross_attention_dim, int):
@@ -910,130 +840,6 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
            if hasattr(module, "set_lora_layer"):
                module.set_lora_layer(None)

-    def get_time_embed(
-        self, sample: torch.Tensor, timestep: Union[torch.Tensor, float, int]
-    ) -> Optional[torch.Tensor]:
-        timesteps = timestep
-        if not torch.is_tensor(timesteps):
-            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
-            # This would be a good case for the `match` statement (Python 3.10+)
-            is_mps = sample.device.type == "mps"
-            if isinstance(timestep, float):
-                dtype = torch.float32 if is_mps else torch.float64
-            else:
-                dtype = torch.int32 if is_mps else torch.int64
-            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
-        elif len(timesteps.shape) == 0:
-            timesteps = timesteps[None].to(sample.device)
-
-        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
-        timesteps = timesteps.expand(sample.shape[0])
-
-        t_emb = self.time_proj(timesteps)
-        # `Timesteps` does not contain any weights and will always return f32 tensors
-        # but time_embedding might actually be running in fp16. so we need to cast here.
-        # there might be better ways to encapsulate this.
-        t_emb = t_emb.to(dtype=sample.dtype)
-        return t_emb
-
-    def get_class_embed(self, sample: torch.Tensor, class_labels: Optional[torch.Tensor]) -> Optional[torch.Tensor]:
-        class_emb = None
-        if self.class_embedding is not None:
-            if class_labels is None:
-                raise ValueError("class_labels should be provided when num_class_embeds > 0")
-
-            if self.config.class_embed_type == "timestep":
-                class_labels = self.time_proj(class_labels)
-
-                # `Timesteps` does not contain any weights and will always return f32 tensors
-                # there might be better ways to encapsulate this.
-                class_labels = class_labels.to(dtype=sample.dtype)
-
-            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
-        return class_emb
-
-    def get_aug_embed(
-        self, emb: torch.Tensor, encoder_hidden_states: torch.Tensor, added_cond_kwargs: Dict
-    ) -> Optional[torch.Tensor]:
-        aug_emb = None
-        if self.config.addition_embed_type == "text":
-            aug_emb = self.add_embedding(encoder_hidden_states)
-        elif self.config.addition_embed_type == "text_image":
-            # Kandinsky 2.1 - style
-            if "image_embeds" not in added_cond_kwargs:
-                raise ValueError(
-                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
-                )
-
-            image_embs = added_cond_kwargs.get("image_embeds")
-            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
-            aug_emb = self.add_embedding(text_embs, image_embs)
-        elif self.config.addition_embed_type == "text_time":
-            # SDXL - style
-            if "text_embeds" not in added_cond_kwargs:
-                raise ValueError(
-                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
-                )
-            text_embeds = added_cond_kwargs.get("text_embeds")
-            if "time_ids" not in added_cond_kwargs:
-                raise ValueError(
-                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
-                )
-            time_ids = added_cond_kwargs.get("time_ids")
-            time_embeds = self.add_time_proj(time_ids.flatten())
-            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
-            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
-            add_embeds = add_embeds.to(emb.dtype)
-            aug_emb = self.add_embedding(add_embeds)
-        elif self.config.addition_embed_type == "image":
-            # Kandinsky 2.2 - style
-            if "image_embeds" not in added_cond_kwargs:
-                raise ValueError(
-                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
-                )
-            image_embs = added_cond_kwargs.get("image_embeds")
-            aug_emb = self.add_embedding(image_embs)
-        elif self.config.addition_embed_type == "image_hint":
-            # Kandinsky 2.2 - style
-            if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
-                raise ValueError(
-                    f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
-                )
-            image_embs = added_cond_kwargs.get("image_embeds")
-            hint = added_cond_kwargs.get("hint")
-            aug_emb = self.add_embedding(image_embs, hint)
-        return aug_emb
-
-    def process_encoder_hidden_states(self, encoder_hidden_states: torch.Tensor, added_cond_kwargs) -> torch.Tensor:
-        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
-            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
-        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
-            # Kadinsky 2.1 - style
-            if "image_embeds" not in added_cond_kwargs:
-                raise ValueError(
-                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
-                )
-
-            image_embeds = added_cond_kwargs.get("image_embeds")
-            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
-        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
-            # Kandinsky 2.2 - style
-            if "image_embeds" not in added_cond_kwargs:
-                raise ValueError(
-                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
-                )
-            image_embeds = added_cond_kwargs.get("image_embeds")
-            encoder_hidden_states = self.encoder_hid_proj(image_embeds)
-        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
-            if "image_embeds" not in added_cond_kwargs:
-                raise ValueError(
-                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
-                )
-            image_embeds = added_cond_kwargs.get("image_embeds")
-            image_embeds = self.encoder_hid_proj(image_embeds)
-            encoder_hidden_states = (encoder_hidden_states, image_embeds)
-        return encoder_hidden_states
-
    def forward(
        self,
        sample: torch.FloatTensor,
@@ -1146,22 +952,96 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
            sample = 2 * sample - 1.0

        # 1. time
-        t_emb = self.get_time_embed(sample=sample, timestep=timestep)
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+
+        t_emb = self.time_proj(timesteps)
+
+        # `Timesteps` does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=sample.dtype)
+
        emb = self.time_embedding(t_emb, timestep_cond)
        aug_emb = None

-        class_emb = self.get_class_embed(sample=sample, class_labels=class_labels)
-        if class_emb is not None:
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+
+                # `Timesteps` does not contain any weights and will always return f32 tensors
+                # there might be better ways to encapsulate this.
+                class_labels = class_labels.to(dtype=sample.dtype)
+
+            class_emb = self.class_embedding(class_labels).to(dtype=sample.dtype)
+
            if self.config.class_embeddings_concat:
                emb = torch.cat([emb, class_emb], dim=-1)
            else:
                emb = emb + class_emb

-        aug_emb = self.get_aug_embed(
-            emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
-        )
-        if self.config.addition_embed_type == "image_hint":
-            aug_emb, hint = aug_emb
+        if self.config.addition_embed_type == "text":
+            aug_emb = self.add_embedding(encoder_hidden_states)
+        elif self.config.addition_embed_type == "text_image":
+            # Kandinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+
+            image_embs = added_cond_kwargs.get("image_embeds")
+            text_embs = added_cond_kwargs.get("text_embeds", encoder_hidden_states)
+            aug_emb = self.add_embedding(text_embs, image_embs)
+        elif self.config.addition_embed_type == "text_time":
+            # SDXL - style
+            if "text_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                )
+            text_embeds = added_cond_kwargs.get("text_embeds")
+            if "time_ids" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                )
+            time_ids = added_cond_kwargs.get("time_ids")
+            time_embeds = self.add_time_proj(time_ids.flatten())
+            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
+            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
+            add_embeds = add_embeds.to(emb.dtype)
+            aug_emb = self.add_embedding(add_embeds)
+        elif self.config.addition_embed_type == "image":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image' which requires the keyword argument `image_embeds` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            aug_emb = self.add_embedding(image_embs)
+        elif self.config.addition_embed_type == "image_hint":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs or "hint" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'image_hint' which requires the keyword arguments `image_embeds` and `hint` to be passed in `added_cond_kwargs`"
+                )
+            image_embs = added_cond_kwargs.get("image_embeds")
+            hint = added_cond_kwargs.get("hint")
+            aug_emb, hint = self.add_embedding(image_embs, hint)
            sample = torch.cat([sample, hint], dim=1)

        emb = emb + aug_emb if aug_emb is not None else emb
@@ -1169,9 +1049,33 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
        if self.time_embed_act is not None:
            emb = self.time_embed_act(emb)

-        encoder_hidden_states = self.process_encoder_hidden_states(
-            encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs
-        )
+        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
+            # Kandinsky 2.1 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states, image_embeds)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "image_proj":
+            # Kandinsky 2.2 - style
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            encoder_hidden_states = self.encoder_hid_proj(image_embeds)
+        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "ip_image_proj":
+            if "image_embeds" not in added_cond_kwargs:
+                raise ValueError(
+                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'ip_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
+                )
+            image_embeds = added_cond_kwargs.get("image_embeds")
+            image_embeds = self.encoder_hid_proj(image_embeds)
+            encoder_hidden_states = (encoder_hidden_states, image_embeds)

        # 2. pre-process
        sample = self.conv_in(sample)
@@ -54,7 +54,7 @@ class UNet3DConditionOutput(BaseOutput):
    The output of [`UNet3DConditionModel`].

    Args:
-        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, num_frames, height, width)`):
+        sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
    """

@@ -74,9 +74,9 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
            Height and width of input/output sample.
        in_channels (`int`, *optional*, defaults to 4): The number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4): The number of channels in the output.
-        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "CrossAttnDownBlock3D", "DownBlock3D")`):
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
            The tuple of downsample blocks to use.
-        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D")`):
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
            The tuple of upsample blocks to use.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
@@ -87,8 +87,8 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
            If `None`, normalization and activation layers is skipped in post-processing.
        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
-        cross_attention_dim (`int`, *optional*, defaults to 1024): The dimension of the cross attention features.
-        attention_head_dim (`int`, *optional*, defaults to 64): The dimension of the attention heads.
+        cross_attention_dim (`int`, *optional*, defaults to 1280): The dimension of the cross attention features.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
        num_attention_heads (`int`, *optional*): The number of attention heads.
    """

@@ -533,7 +533,7 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)

        Args:
            sample (`torch.FloatTensor`):
-                The noisy input tensor with the following shape `(batch, num_channels, num_frames, height, width`.
+                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.FloatTensor`):
                The encoder hidden states with shape `(batch, sequence_length, feature_dim)`.
@@ -13,10 +13,12 @@
 # limitations under the License.

 import inspect
-from typing import Any, Callable, Dict, List, Optional, Union
+import math
+from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import numpy as np
 import torch
+import torch.fft as fft
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

 from ...image_processor import PipelineImageInput, VaeImageProcessor
@@ -41,7 +43,6 @@ from ...utils import (
    unscale_lora_layers,
 )
 from ...utils.torch_utils import randn_tensor
-from ..free_init_utils import FreeInitMixin
 from ..pipeline_utils import DiffusionPipeline
 from .pipeline_output import AnimateDiffPipelineOutput

@@ -86,9 +87,72 @@ def tensor2vid(video: torch.Tensor, processor: "VaeImageProcessor", output_type:
    return outputs


-class AnimateDiffPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FreeInitMixin
-):
+def _get_freeinit_freq_filter(
+    shape: Tuple[int, ...],
+    device: Union[str, torch.device],
+    filter_type: str,
+    order: float,
+    spatial_stop_frequency: float,
+    temporal_stop_frequency: float,
+) -> torch.Tensor:
+    r"""Returns the FreeInit filter based on filter type and other input conditions."""
+
+    T, H, W = shape[-3], shape[-2], shape[-1]
+    mask = torch.zeros(shape)
+
+    if spatial_stop_frequency == 0 or temporal_stop_frequency == 0:
+        return mask
+
+    if filter_type == "butterworth":
+
+        def retrieve_mask(x):
+            return 1 / (1 + (x / spatial_stop_frequency**2) ** order)
+    elif filter_type == "gaussian":
+
+        def retrieve_mask(x):
+            return math.exp(-1 / (2 * spatial_stop_frequency**2) * x)
+    elif filter_type == "ideal":
+
+        def retrieve_mask(x):
+            return 1 if x <= spatial_stop_frequency * 2 else 0
+    else:
+        raise NotImplementedError("`filter_type` must be one of gaussian, butterworth or ideal")
+
+    for t in range(T):
+        for h in range(H):
+            for w in range(W):
+                d_square = (
+                    ((spatial_stop_frequency / temporal_stop_frequency) * (2 * t / T - 1)) ** 2
+                    + (2 * h / H - 1) ** 2
+                    + (2 * w / W - 1) ** 2
+                )
+                mask[..., t, h, w] = retrieve_mask(d_square)
+
+    return mask.to(device)
+
+
+def _freq_mix_3d(x: torch.Tensor, noise: torch.Tensor, LPF: torch.Tensor) -> torch.Tensor:
+    r"""Noise reinitialization."""
+    # FFT
+    x_freq = fft.fftn(x, dim=(-3, -2, -1))
+    x_freq = fft.fftshift(x_freq, dim=(-3, -2, -1))
+    noise_freq = fft.fftn(noise, dim=(-3, -2, -1))
+    noise_freq = fft.fftshift(noise_freq, dim=(-3, -2, -1))
+
+    # frequency mix
+    HPF = 1 - LPF
+    x_freq_low = x_freq * LPF
+    noise_freq_high = noise_freq * HPF
+    x_freq_mixed = x_freq_low + noise_freq_high  # mix in freq domain
+
+    # IFFT
+    x_freq_mixed = fft.ifftshift(x_freq_mixed, dim=(-3, -2, -1))
+    x_mixed = fft.ifftn(x_freq_mixed, dim=(-3, -2, -1)).real
+
+    return x_mixed
+
+
+class AnimateDiffPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
    r"""
    Pipeline for text-to-video generation.

@@ -118,7 +182,7 @@ class AnimateDiffPipeline(
    """

    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
-    _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
+    _optional_components = ["feature_extractor", "image_encoder"]
    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]

    def __init__(
@@ -140,8 +204,7 @@ class AnimateDiffPipeline(
        image_encoder: CLIPVisionModelWithProjection = None,
    ):
        super().__init__()
-        if isinstance(unet, UNet2DConditionModel):
-            unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+        unet = UNetMotionModel.from_unet2d(unet, motion_adapter)

        self.register_modules(
            vae=vae,
@@ -467,10 +530,63 @@ class AnimateDiffPipeline(
            raise ValueError("The pipeline must have `unet` for using FreeU.")
        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)

+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_freeu
    def disable_freeu(self):
        """Disables the FreeU mechanism if enabled."""
        self.unet.disable_freeu()

+    @property
+    def free_init_enabled(self):
+        return hasattr(self, "_free_init_num_iters") and self._free_init_num_iters is not None
+
+    def enable_free_init(
+        self,
+        num_iters: int = 3,
+        use_fast_sampling: bool = False,
+        method: str = "butterworth",
+        order: int = 4,
+        spatial_stop_frequency: float = 0.25,
+        temporal_stop_frequency: float = 0.25,
+        generator: torch.Generator = None,
+    ):
+        """Enables the FreeInit mechanism as in https://arxiv.org/abs/2312.07537.
+
+        This implementation has been adapted from the [official repository](https://github.com/TianxingWu/FreeInit).
+
+        Args:
+            num_iters (`int`, *optional*, defaults to `3`):
+                Number of FreeInit noise re-initialization iterations.
+            use_fast_sampling (`bool`, *optional*, defaults to `False`):
+                Whether or not to speed up the sampling procedure at the cost of potentially lower-quality results. Enables
+                the "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
+            method (`str`, *optional*, defaults to `butterworth`):
+                Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the
+                FreeInit low pass filter.
+            order (`int`, *optional*, defaults to `4`):
+                Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour
+                whereas lower values lead to `gaussian` method behaviour.
+            spatial_stop_frequency (`float`, *optional*, defaults to `0.25`):
+                Normalized stop frequency for spatial dimensions. Must be between 0 and 1. Referred to as `d_s` in
+                the original implementation.
+            temporal_stop_frequency (`float`, *optional*, defaults to `0.25`):
+                Normalized stop frequency for temporal dimensions. Must be between 0 and 1. Referred to as `d_t` in
+                the original implementation.
+            generator (`torch.Generator`, *optional*, defaults to `None`):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                FreeInit generation deterministic.
+        """
+        self._free_init_num_iters = num_iters
+        self._free_init_use_fast_sampling = use_fast_sampling
+        self._free_init_method = method
+        self._free_init_order = order
+        self._free_init_spatial_stop_frequency = spatial_stop_frequency
+        self._free_init_temporal_stop_frequency = temporal_stop_frequency
+        self._free_init_generator = generator
+
+    def disable_free_init(self):
+        """Disables the FreeInit mechanism if enabled."""
+        self._free_init_num_iters = None
+
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@@ -575,6 +691,158 @@ class AnimateDiffPipeline(
        latents = latents * self.scheduler.init_noise_sigma
        return latents

+    def _denoise_loop(
+        self,
+        timesteps,
+        num_inference_steps,
+        do_classifier_free_guidance,
+        guidance_scale,
+        num_warmup_steps,
+        prompt_embeds,
+        negative_prompt_embeds,
+        latents,
+        cross_attention_kwargs,
+        added_cond_kwargs,
+        extra_step_kwargs,
+        callback,
+        callback_steps,
+        callback_on_step_end,
+        callback_on_step_end_tensor_inputs,
+    ):
+        """Run one full pass of the AnimateDiff denoising loop and return the resulting latents.
+
+        For every entry of `timesteps` the UNet predicts the noise residual, classifier-free guidance is applied
+        when enabled, and the scheduler computes the next latents.
+
+        Args:
+            timesteps:
+                Iterable of timesteps to denoise over, in the order they are visited.
+            num_inference_steps:
+                Used only as the `total` of the progress bar.
+            do_classifier_free_guidance:
+                When truthy, the latents are duplicated before the UNet call and the prediction is split into two
+                halves that are recombined with `guidance_scale`.
+            guidance_scale:
+                Weight applied to the difference between the two prediction halves.
+            num_warmup_steps:
+                Steps with `(i + 1) <= num_warmup_steps` do not advance the progress bar (except the last step).
+            prompt_embeds:
+                Passed to the UNet as `encoder_hidden_states`. May be replaced by `callback_on_step_end`.
+            negative_prompt_embeds:
+                Not used by the loop itself; only exposed to, and replaceable by, `callback_on_step_end`.
+            latents:
+                Starting latents. May be replaced by `callback_on_step_end` after each step.
+            cross_attention_kwargs:
+                Forwarded unchanged to the UNet.
+            added_cond_kwargs:
+                Forwarded unchanged to the UNet.
+            extra_step_kwargs:
+                Extra keyword arguments unpacked into `self.scheduler.step`.
+            callback:
+                Optional callable invoked as `callback(i, t, latents)` on progress-bar updates where
+                `i % callback_steps == 0`.
+            callback_steps:
+                Frequency divisor used for `callback`.
+            callback_on_step_end:
+                Optional callable invoked after every scheduler step as
+                `callback_on_step_end(self, i, t, callback_kwargs)`; it must return a dict.
+            callback_on_step_end_tensor_inputs:
+                Names of local variables of this method to hand to `callback_on_step_end`.
+
+        Returns:
+            The latents after the last timestep.
+        """
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                ).sample
+
+                # perform guidance
+                # NOTE(review): assumes the first half of the batch is the unconditional prediction, i.e. that
+                # `prompt_embeds` was built as [negative, positive] by the caller - confirm against `__call__`.
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    # The requested names are resolved against this method's local variables, so renaming a
+                    # local here changes what callbacks can ask for; an unknown name raises `KeyError`.
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    # Let the callback override the tensors used by the following steps.
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # call the callback, if provided
+                # The bar advances on the last step, or after warmup on steps that are a multiple of the
+                # scheduler order.
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+
+        return latents
+
+    def _free_init_loop(
+        self,
+        height,
+        width,
+        num_frames,
+        num_channels_latents,
+        batch_size,
+        num_videos_per_prompt,
+        denoise_args,
+        device,
+    ):
+        """Run the denoising loop `self._free_init_num_iters` times using FreeInit noise reinitialization.
+
+        The first iteration denoises the latents found in `denoise_args` unchanged. Every later iteration re-noises
+        the previous result with the initial noise at the last training timestep, mixes it with freshly sampled
+        noise through the FreeInit frequency filter, and denoises again.
+
+        Args:
+            height (`int`):
+                Output height; divided by `self.vae_scale_factor` to obtain the latent height.
+            width (`int`):
+                Output width; divided by `self.vae_scale_factor` to obtain the latent width.
+            num_frames (`int`):
+                Number of frames in the latent video.
+            num_channels_latents (`int`):
+                Number of latent channels.
+            batch_size (`int`):
+                Number of prompts in the batch.
+            num_videos_per_prompt (`int`):
+                Number of videos generated per prompt; the latent batch is `batch_size * num_videos_per_prompt`.
+            denoise_args (`dict`):
+                Keyword arguments for `_denoise_loop`. The keys `latents`, `prompt_embeds`, `timesteps` and
+                `num_inference_steps` are read from it. The dict passed in is not modified.
+            device:
+                Device on which the filter, the timesteps and the fresh noise are created.
+
+        Returns:
+            The latents produced by the last FreeInit iteration.
+        """
+        # Work on a shallow copy so the per-iteration updates below do not leak into the caller's dict.
+        denoise_args = dict(denoise_args)
+
+        latents = denoise_args.get("latents")
+        prompt_embeds = denoise_args.get("prompt_embeds")
+        timesteps = denoise_args.get("timesteps")
+        num_inference_steps = denoise_args.get("num_inference_steps")
+
+        latent_height = height // self.vae_scale_factor
+        latent_width = width // self.vae_scale_factor
+        latent_shape = (
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            num_frames,
+            latent_height,
+            latent_width,
+        )
+        free_init_filter_shape = (1, num_channels_latents, num_frames, latent_height, latent_width)
+        free_init_freq_filter = _get_freeinit_freq_filter(
+            shape=free_init_filter_shape,
+            device=device,
+            filter_type=self._free_init_method,
+            order=self._free_init_order,
+            spatial_stop_frequency=self._free_init_spatial_stop_frequency,
+            temporal_stop_frequency=self._free_init_temporal_stop_frequency,
+        )
+
+        with self.progress_bar(total=self._free_init_num_iters) as free_init_progress_bar:
+            for i in range(self._free_init_num_iters):
+                # For the first FreeInit iteration, the original latent is used without modification.
+                # Subsequent iterations apply the noise reinitialization technique.
+                if i == 0:
+                    initial_noise = latents.detach().clone()
+                else:
+                    # Diffuse back to the noise level of the last training timestep.
+                    current_diffuse_timestep = self.scheduler.config.num_train_timesteps - 1
+                    # One timestep per latent sample: the latent batch is batch_size * num_videos_per_prompt, so
+                    # sizing this by `batch_size` alone fails to broadcast when both factors are larger than 1.
+                    diffuse_timesteps = torch.full((latent_shape[0],), current_diffuse_timestep).long()
+                    z_T = self.scheduler.add_noise(
+                        original_samples=latents, noise=initial_noise, timesteps=diffuse_timesteps.to(device)
+                    ).to(dtype=torch.float32)
+                    z_rand = randn_tensor(
+                        shape=latent_shape,
+                        generator=self._free_init_generator,
+                        device=device,
+                        dtype=torch.float32,
+                    )
+                    latents = _freq_mix_3d(z_T, z_rand, LPF=free_init_freq_filter)
+                    latents = latents.to(prompt_embeds.dtype)
+
+                current_num_inference_steps = num_inference_steps
+                # Coarse-to-Fine Sampling for faster inference (can lead to lower quality)
+                if self._free_init_use_fast_sampling:
+                    # Clamp to at least one step: the scaled value truncates to 0 when `num_inference_steps` is
+                    # smaller than the number of FreeInit iterations.
+                    current_num_inference_steps = max(
+                        1, int(num_inference_steps / self._free_init_num_iters * (i + 1))
+                    )
+
+                # The first regular iteration runs on the schedule prepared by the caller. Any other iteration
+                # re-initialises the scheduler so that schedulers keeping per-run state (e.g. a step index)
+                # start from a clean schedule instead of continuing past the end of the previous run.
+                if i > 0 or self._free_init_use_fast_sampling:
+                    self.scheduler.set_timesteps(current_num_inference_steps, device=device)
+                    timesteps = self.scheduler.timesteps
+
+                # Warmup must be derived from the step count actually used in this iteration.
+                num_warmup_steps = len(timesteps) - current_num_inference_steps * self.scheduler.order
+                denoise_args.update(
+                    {
+                        "timesteps": timesteps,
+                        "num_inference_steps": current_num_inference_steps,
+                        "latents": latents,
+                        "num_warmup_steps": num_warmup_steps,
+                    }
+                )
+                latents = self._denoise_loop(**denoise_args)
+
+                free_init_progress_bar.update()
+
+        return latents
+
+    def _retrieve_video_frames(self, latents, output_type, return_dict):
+        """Convert the final latents into the value returned by the pipeline.
+
+        Args:
+            latents:
+                Latents produced by the denoising loop.
+            output_type (`str`):
+                `"latent"` returns the latents as they are; any other value decodes them with
+                `self.decode_latents` and converts the result with `tensor2vid`.
+            return_dict (`bool`):
+                When falsy, a one-element tuple `(video,)` is returned instead of an
+                `AnimateDiffPipelineOutput`. This is honoured for `output_type="latent"` as well.
+
+        Returns:
+            `AnimateDiffPipelineOutput` with the frames in `frames`, or `(video,)` when `return_dict` is falsy.
+        """
+        if output_type == "latent":
+            # No decoding requested: hand back the raw latents through the same return path as decoded video.
+            video = latents
+        else:
+            video_tensor = self.decode_latents(latents)
+            video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+
+        if not return_dict:
+            return (video,)
+
+        return AnimateDiffPipelineOutput(frames=video)
+
    @property
    def guidance_scale(self):
        return self._guidance_scale
@@ -778,6 +1046,7 @@ class AnimateDiffPipeline(
        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps
+        self._num_timesteps = len(timesteps)

        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
@@ -797,70 +1066,45 @@ class AnimateDiffPipeline(
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Add image embeds for IP-Adapter
-        added_cond_kwargs = (
-            {"image_embeds": image_embeds}
-            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
-            else None
-        )
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

-        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
-        for free_init_iter in range(num_free_init_iters):
-            if self.free_init_enabled:
-                latents, timesteps = self._apply_free_init(
-                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
-                )
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        denoise_args = {
+            "timesteps": timesteps,
+            "num_inference_steps": num_inference_steps,
+            "do_classifier_free_guidance": self.do_classifier_free_guidance,
+            "guidance_scale": guidance_scale,
+            "num_warmup_steps": num_warmup_steps,
+            "prompt_embeds": prompt_embeds,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "latents": latents,
+            "cross_attention_kwargs": self.cross_attention_kwargs,
+            "added_cond_kwargs": added_cond_kwargs,
+            "extra_step_kwargs": extra_step_kwargs,
+            "callback": callback,
+            "callback_steps": callback_steps,
+            "callback_on_step_end": callback_on_step_end,
+            "callback_on_step_end_tensor_inputs": callback_on_step_end_tensor_inputs,
+        }

-            self._num_timesteps = len(timesteps)
-            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-            with self.progress_bar(total=num_inference_steps) as progress_bar:
-                for i, t in enumerate(timesteps):
-                    # expand the latents if we are doing classifier free guidance
-                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
-                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+        if self.free_init_enabled:
+            latents = self._free_init_loop(
+                height=height,
+                width=width,
+                num_frames=num_frames,
+                num_channels_latents=num_channels_latents,
+                batch_size=batch_size,
+                num_videos_per_prompt=num_videos_per_prompt,
+                denoise_args=denoise_args,
+                device=device,
+            )
+        else:
+            latents = self._denoise_loop(**denoise_args)

-                    # predict the noise residual
-                    noise_pred = self.unet(
-                        latent_model_input,
-                        t,
-                        encoder_hidden_states=prompt_embeds,
-                        cross_attention_kwargs=cross_attention_kwargs,
-                        added_cond_kwargs=added_cond_kwargs,
-                    ).sample
-
-                    # perform guidance
-                    if self.do_classifier_free_guidance:
-                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                    # compute the previous noisy sample x_t -> x_t-1
-                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-
-                    if callback_on_step_end is not None:
-                        callback_kwargs = {}
-                        for k in callback_on_step_end_tensor_inputs:
-                            callback_kwargs[k] = locals()[k]
-                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                        latents = callback_outputs.pop("latents", latents)
-                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
-                    # call the callback, if provided
-                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                        progress_bar.update()
-                        if callback is not None and i % callback_steps == 0:
-                            callback(i, t, latents)
-
-        if output_type == "latent":
-            return AnimateDiffPipelineOutput(frames=latents)
-
-        video_tensor = self.decode_latents(latents)
-        video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+        video = self._retrieve_video_frames(latents, output_type, return_dict)

        # 9. Offload all models
        self.maybe_free_model_hooks()

-        if not return_dict:
-            return (video,)
-
-        return AnimateDiffPipelineOutput(frames=video)
+        return video
@@ -34,7 +34,6 @@ from ...schedulers import (
 )
 from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
 from ...utils.torch_utils import randn_tensor
-from ..free_init_utils import FreeInitMixin
 from ..pipeline_utils import DiffusionPipeline
 from .pipeline_output import AnimateDiffPipelineOutput

@@ -164,9 +163,7 @@ def retrieve_timesteps(
    return timesteps, num_inference_steps


-class AnimateDiffVideoToVideoPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FreeInitMixin
-):
+class AnimateDiffVideoToVideoPipeline(DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin):
    r"""
    Pipeline for video-to-video generation.

@@ -196,7 +193,7 @@ class AnimateDiffVideoToVideoPipeline(
    """

    model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae"
-    _optional_components = ["feature_extractor", "image_encoder", "motion_adapter"]
+    _optional_components = ["feature_extractor", "image_encoder"]
    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]

    def __init__(
@@ -218,8 +215,7 @@ class AnimateDiffVideoToVideoPipeline(
        image_encoder: CLIPVisionModelWithProjection = None,
    ):
        super().__init__()
-        if isinstance(unet, UNet2DConditionModel):
-            unet = UNetMotionModel.from_unet2d(unet, motion_adapter)
+        unet = UNetMotionModel.from_unet2d(unet, motion_adapter)

        self.register_modules(
            vae=vae,
@@ -441,41 +437,6 @@ class AnimateDiffVideoToVideoPipeline(

            return image_embeds, uncond_image_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
-    def prepare_ip_adapter_image_embeds(
-        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
-    ):
-        if ip_adapter_image_embeds is None:
-            if not isinstance(ip_adapter_image, list):
-                ip_adapter_image = [ip_adapter_image]
-
-            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
-                raise ValueError(
-                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
-                )
-
-            image_embeds = []
-            for single_ip_adapter_image, image_proj_layer in zip(
-                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
-            ):
-                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
-                single_image_embeds, single_negative_image_embeds = self.encode_image(
-                    single_ip_adapter_image, device, 1, output_hidden_state
-                )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )
-
-                if self.do_classifier_free_guidance:
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                    single_image_embeds = single_image_embeds.to(device)
-
-                image_embeds.append(single_image_embeds)
-        else:
-            image_embeds = ip_adapter_image_embeds
-        return image_embeds
-
    # Copied from diffusers.pipelines.text_to_video_synthesis/pipeline_text_to_video_synth.TextToVideoSDPipeline.decode_latents
    def decode_latents(self, latents):
        latents = 1 / self.vae.config.scaling_factor * latents
@@ -623,12 +584,12 @@ class AnimateDiffVideoToVideoPipeline(
        if video is not None and latents is not None:
            raise ValueError("Only one of `video` or `latents` should be provided")

-    def get_timesteps(self, num_inference_steps, timesteps, strength, device):
+    def get_timesteps(self, num_inference_steps, strength, device):
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        t_start = max(num_inference_steps - init_timestep, 0)
-        timesteps = timesteps[t_start * self.scheduler.order :]
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]

        return timesteps, num_inference_steps - t_start

@@ -770,7 +731,6 @@ class AnimateDiffVideoToVideoPipeline(
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
        ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -820,9 +780,6 @@ class AnimateDiffVideoToVideoPipeline(
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*):
                Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. If not
-                provided, embeddings are computed from the `ip_adapter_image` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated video. Choose between `torch.FloatTensor`, `PIL.Image` or
                `np.array`.
@@ -909,15 +866,19 @@ class AnimateDiffVideoToVideoPipeline(
        if self.do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

-        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
-            image_embeds = self.prepare_ip_adapter_image_embeds(
-                ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_videos_per_prompt
+        if ip_adapter_image is not None:
+            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
+            image_embeds, negative_image_embeds = self.encode_image(
+                ip_adapter_image, device, num_videos_per_prompt, output_hidden_state
            )
+            if self.do_classifier_free_guidance:
+                image_embeds = torch.cat([negative_image_embeds, image_embeds])

        # 4. Prepare timesteps
        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
-        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
        latent_timestep = timesteps[:1].repeat(batch_size * num_videos_per_prompt)
+        self._num_timesteps = len(timesteps)

        # 5. Prepare latent variables
        num_channels_latents = self.unet.config.in_channels
@@ -938,61 +899,44 @@ class AnimateDiffVideoToVideoPipeline(
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Add image embeds for IP-Adapter
-        added_cond_kwargs = (
-            {"image_embeds": image_embeds}
-            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
-            else None
-        )
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

-        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
-        for free_init_iter in range(num_free_init_iters):
-            if self.free_init_enabled:
-                latents, timesteps = self._apply_free_init(
-                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
-                )
-                num_inference_steps = len(timesteps)
-                # make sure to readjust timesteps based on strength
-                timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, timesteps, strength, device)
+        # 8. Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

-            self._num_timesteps = len(timesteps)
-            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-            # 8. Denoising loop
-            with self.progress_bar(total=num_inference_steps) as progress_bar:
-                for i, t in enumerate(timesteps):
-                    # expand the latents if we are doing classifier free guidance
-                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
-                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=self.cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                ).sample

-                    # predict the noise residual
-                    noise_pred = self.unet(
-                        latent_model_input,
-                        t,
-                        encoder_hidden_states=prompt_embeds,
-                        cross_attention_kwargs=self.cross_attention_kwargs,
-                        added_cond_kwargs=added_cond_kwargs,
-                    ).sample
+                # perform guidance
+                if self.do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)

-                    # perform guidance
-                    if self.do_classifier_free_guidance:
-                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample

-                    # compute the previous noisy sample x_t -> x_t-1
-                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

-                    if callback_on_step_end is not None:
-                        callback_kwargs = {}
-                        for k in callback_on_step_end_tensor_inputs:
-                            callback_kwargs[k] = locals()[k]
-                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)

-                        latents = callback_outputs.pop("latents", latents)
-                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
-                    # call the callback, if provided
-                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                        progress_bar.update()
+                progress_bar.update()

        if output_type == "latent":
            return AnimateDiffPipelineOutput(frames=latents)
@@ -1206,11 +1206,7 @@ class StableDiffusionControlNetPipeline(
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = (
-            {"image_embeds": image_embeds}
-            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
-            else None
-        )
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

        # 7.2 Create tensor stating which controlnets to keep
        controlnet_keep = []
@@ -972,12 +972,6 @@ class StableDiffusionControlNetImg2ImgPipeline(
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The width in pixels of the generated image.
-            strength (`float`, *optional*, defaults to 0.8):
-                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
-                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
-                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
-                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
-                essentially ignores `image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
@@ -1206,11 +1200,7 @@ class StableDiffusionControlNetImg2ImgPipeline(
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = (
-            {"image_embeds": image_embeds}
-            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
-            else None
-        )
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

        # 7.2 Create tensor stating which controlnets to keep
        controlnet_keep = []
@@ -1495,11 +1495,7 @@ class StableDiffusionControlNetInpaintPipeline(
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = (
-            {"image_embeds": image_embeds}
-            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
-            else None
-        )
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

        # 7.2 Create tensor stating which controlnets to keep
        controlnet_keep = []
@@ -19,22 +19,11 @@ import numpy as np
 import PIL.Image
 import torch
 import torch.nn.functional as F
-from transformers import (
-    CLIPImageProcessor,
-    CLIPTextModel,
-    CLIPTextModelWithProjection,
-    CLIPTokenizer,
-    CLIPVisionModelWithProjection,
-)
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer

 from ...image_processor import PipelineImageInput, VaeImageProcessor
-from ...loaders import (
-    FromSingleFileMixin,
-    IPAdapterMixin,
-    StableDiffusionXLLoraLoaderMixin,
-    TextualInversionLoaderMixin,
-)
-from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
+from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
+from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
 from ...models.attention_processor import (
    AttnProcessor2_0,
    LoRAAttnProcessor2_0,
@@ -151,7 +140,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):


 class StableDiffusionXLControlNetInpaintPipeline(
-    DiffusionPipeline, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin, IPAdapterMixin
+    DiffusionPipeline, StableDiffusionXLLoraLoaderMixin, FromSingleFileMixin
 ):
    r"""
    Pipeline for text-to-image generation using Stable Diffusion XL.
@@ -163,7 +152,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
        - [`~loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] for loading LoRA weights
        - [`~loaders.StableDiffusionXLLoraLoaderMixin.save_lora_weights`] for saving LoRA weights
        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
-        - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters

    Args:
        vae ([`AutoencoderKL`]):
@@ -207,8 +195,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
        requires_aesthetics_score: bool = False,
        force_zeros_for_empty_prompt: bool = True,
        add_watermarker: Optional[bool] = None,
-        feature_extractor: Optional[CLIPImageProcessor] = None,
-        image_encoder: Optional[CLIPVisionModelWithProjection] = None,
    ):
        super().__init__()

@@ -224,8 +210,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
            unet=unet,
            controlnet=controlnet,
            scheduler=scheduler,
-            feature_extractor=feature_extractor,
-            image_encoder=image_encoder,
        )
        self.register_to_config(force_zeros_for_empty_prompt=force_zeros_for_empty_prompt)
        self.register_to_config(requires_aesthetics_score=requires_aesthetics_score)
@@ -513,66 +497,6 @@ class StableDiffusionXLControlNetInpaintPipeline(

        return prompt_embeds, negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
-    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
-        dtype = next(self.image_encoder.parameters()).dtype
-
-        if not isinstance(image, torch.Tensor):
-            image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
-        image = image.to(device=device, dtype=dtype)
-        if output_hidden_states:
-            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_enc_hidden_states = self.image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
-                num_images_per_prompt, dim=0
-            )
-            return image_enc_hidden_states, uncond_image_enc_hidden_states
-        else:
-            image_embeds = self.image_encoder(image).image_embeds
-            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_embeds = torch.zeros_like(image_embeds)
-
-            return image_embeds, uncond_image_embeds
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
-    def prepare_ip_adapter_image_embeds(
-        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
-    ):
-        if ip_adapter_image_embeds is None:
-            if not isinstance(ip_adapter_image, list):
-                ip_adapter_image = [ip_adapter_image]
-
-            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
-                raise ValueError(
-                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
-                )
-
-            image_embeds = []
-            for single_ip_adapter_image, image_proj_layer in zip(
-                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
-            ):
-                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
-                single_image_embeds, single_negative_image_embeds = self.encode_image(
-                    single_ip_adapter_image, device, 1, output_hidden_state
-                )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )
-
-                if self.do_classifier_free_guidance:
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                    single_image_embeds = single_image_embeds.to(device)
-
-                image_embeds.append(single_image_embeds)
-        else:
-            image_embeds = ip_adapter_image_embeds
-        return image_embeds
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@@ -642,8 +566,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
        negative_prompt_2=None,
        prompt_embeds=None,
        negative_prompt_embeds=None,
-        ip_adapter_image=None,
-        ip_adapter_image_embeds=None,
        pooled_prompt_embeds=None,
        negative_pooled_prompt_embeds=None,
        controlnet_conditioning_scale=1.0,
@@ -830,11 +752,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
            if end > 1.0:
                raise ValueError(f"control guidance end: {end} can't be larger than 1.0.")

-        if ip_adapter_image is not None and ip_adapter_image_embeds is not None:
-            raise ValueError(
-                "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined."
-            )
-
    def prepare_control_image(
        self,
        image,
@@ -1183,8 +1100,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
        pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
        output_type: Optional[str] = "pil",
@@ -1279,10 +1194,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
                argument.
-            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. If not
-                provided, embeddings are computed from the `ip_adapter_image` input argument.
            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
                If not provided, pooled text embeddings will be generated from `prompt` input argument.
@@ -1415,8 +1326,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
            negative_prompt_2,
            prompt_embeds,
            negative_prompt_embeds,
-            ip_adapter_image,
-            ip_adapter_image_embeds,
            pooled_prompt_embeds,
            negative_pooled_prompt_embeds,
            controlnet_conditioning_scale,
@@ -1469,12 +1378,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
            clip_skip=self.clip_skip,
        )

-        # 3.1 Encode ip_adapter_image
-        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
-            image_embeds = self.prepare_ip_adapter_image_embeds(
-                ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt
-            )
-
        # 4. set timesteps
        def denoising_value_valid(dnv):
            return isinstance(denoising_end, float) and 0 < dnv < 1
@@ -1746,9 +1649,6 @@ class StableDiffusionXLControlNetInpaintPipeline(
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])

-                if ip_adapter_image is not None:
-                    added_cond_kwargs["image_embeds"] = image_embeds
-
                if num_channels_unet == 9:
                    latent_model_input = torch.cat([latent_model_input, mask, masked_image_latents], dim=1)

@@ -1156,15 +1156,15 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                The width in pixels of the generated image. Anything below 512 pixels won't work well for
                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
                and checkpoints that are not specifically fine-tuned on low resolutions.
-            strength (`float`, *optional*, defaults to 0.8):
-                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
-                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
-                on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising
-                process runs for the full number of iterations specified in `num_inference_steps`. A value of 1
-                essentially ignores `image`.
            num_inference_steps (`int`, *optional*, defaults to 50):
                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
                expense of slower inference.
+            strength (`float`, *optional*, defaults to 0.3):
+                Conceptually, indicates how much to transform the reference `image`. Must be between 0 and 1. `image`
+                will be used as a starting point, adding more noise to it the larger the `strength`. The number of
+                denoising steps depends on the amount of noise initially added. When `strength` is 1, added noise will
+                be maximum and the denoising process will run for the full number of iterations specified in
+                `num_inference_steps`.
            guidance_scale (`float`, *optional*, defaults to 7.5):
                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
                `guidance_scale` is defined as `w` of equation 2. of [Imagen
@@ -268,6 +268,7 @@ class GLIGENTextBoundingboxProjection(nn.Module):
        return objs


+# Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel with UNet2DConditionModel->UNetFlatConditionModel, nn.Conv2d->LinearMultiDim, Block2D->BlockFlat
 class UNetFlatConditionModel(ModelMixin, ConfigMixin):
    r"""
    A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
@@ -1,184 +0,0 @@
-# Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import math
-from typing import Tuple, Union
-
-import torch
-import torch.fft as fft
-
-from ..utils.torch_utils import randn_tensor
-
-
-class FreeInitMixin:
-    r"""Mixin class for FreeInit."""
-
-    def enable_free_init(
-        self,
-        num_iters: int = 3,
-        use_fast_sampling: bool = False,
-        method: str = "butterworth",
-        order: int = 4,
-        spatial_stop_frequency: float = 0.25,
-        temporal_stop_frequency: float = 0.25,
-    ):
-        """Enables the FreeInit mechanism as in https://arxiv.org/abs/2312.07537.
-
-        This implementation has been adapted from the [official repository](https://github.com/TianxingWu/FreeInit).
-
-        Args:
-            num_iters (`int`, *optional*, defaults to `3`):
-                Number of FreeInit noise re-initialization iterations.
-            use_fast_sampling (`bool`, *optional*, defaults to `False`):
-                Whether or not to speedup sampling procedure at the cost of probably lower quality results. Enables
-                the "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
-            method (`str`, *optional*, defaults to `butterworth`):
-                Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the
-                FreeInit low pass filter.
-            order (`int`, *optional*, defaults to `4`):
-                Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour
-                whereas lower values lead to `gaussian` method behaviour.
-            spatial_stop_frequency (`float`, *optional*, defaults to `0.25`):
-                Normalized stop frequency for spatial dimensions. Must be between 0 to 1. Referred to as `d_s` in
-                the original implementation.
-            temporal_stop_frequency (`float`, *optional*, defaults to `0.25`):
-                Normalized stop frequency for temporal dimensions. Must be between 0 to 1. Referred to as `d_t` in
-                the original implementation.
-        """
-        self._free_init_num_iters = num_iters
-        self._free_init_use_fast_sampling = use_fast_sampling
-        self._free_init_method = method
-        self._free_init_order = order
-        self._free_init_spatial_stop_frequency = spatial_stop_frequency
-        self._free_init_temporal_stop_frequency = temporal_stop_frequency
-
-    def disable_free_init(self):
-        """Disables the FreeInit mechanism if enabled."""
-        self._free_init_num_iters = None
-
-    @property
-    def free_init_enabled(self):
-        return hasattr(self, "_free_init_num_iters") and self._free_init_num_iters is not None
-
-    def _get_free_init_freq_filter(
-        self,
-        shape: Tuple[int, ...],
-        device: Union[str, torch.dtype],
-        filter_type: str,
-        order: float,
-        spatial_stop_frequency: float,
-        temporal_stop_frequency: float,
-    ) -> torch.Tensor:
-        r"""Returns the FreeInit filter based on filter type and other input conditions."""
-
-        time, height, width = shape[-3], shape[-2], shape[-1]
-        mask = torch.zeros(shape)
-
-        if spatial_stop_frequency == 0 or temporal_stop_frequency == 0:
-            return mask
-
-        if filter_type == "butterworth":
-
-            def retrieve_mask(x):
-                return 1 / (1 + (x / spatial_stop_frequency**2) ** order)
-        elif filter_type == "gaussian":
-
-            def retrieve_mask(x):
-                return math.exp(-1 / (2 * spatial_stop_frequency**2) * x)
-        elif filter_type == "ideal":
-
-            def retrieve_mask(x):
-                return 1 if x <= spatial_stop_frequency * 2 else 0
-        else:
-            raise NotImplementedError("`filter_type` must be one of gaussian, butterworth or ideal")
-
-        for t in range(time):
-            for h in range(height):
-                for w in range(width):
-                    d_square = (
-                        ((spatial_stop_frequency / temporal_stop_frequency) * (2 * t / time - 1)) ** 2
-                        + (2 * h / height - 1) ** 2
-                        + (2 * w / width - 1) ** 2
-                    )
-                    mask[..., t, h, w] = retrieve_mask(d_square)
-
-        return mask.to(device)
-
-    def _apply_freq_filter(self, x: torch.Tensor, noise: torch.Tensor, low_pass_filter: torch.Tensor) -> torch.Tensor:
-        r"""Noise reinitialization."""
-        # FFT
-        x_freq = fft.fftn(x, dim=(-3, -2, -1))
-        x_freq = fft.fftshift(x_freq, dim=(-3, -2, -1))
-        noise_freq = fft.fftn(noise, dim=(-3, -2, -1))
-        noise_freq = fft.fftshift(noise_freq, dim=(-3, -2, -1))
-
-        # frequency mix
-        high_pass_filter = 1 - low_pass_filter
-        x_freq_low = x_freq * low_pass_filter
-        noise_freq_high = noise_freq * high_pass_filter
-        x_freq_mixed = x_freq_low + noise_freq_high  # mix in freq domain
-
-        # IFFT
-        x_freq_mixed = fft.ifftshift(x_freq_mixed, dim=(-3, -2, -1))
-        x_mixed = fft.ifftn(x_freq_mixed, dim=(-3, -2, -1)).real
-
-        return x_mixed
-
-    def _apply_free_init(
-        self,
-        latents: torch.Tensor,
-        free_init_iteration: int,
-        num_inference_steps: int,
-        device: torch.device,
-        dtype: torch.dtype,
-        generator: torch.Generator,
-    ):
-        if free_init_iteration == 0:
-            self._free_init_initial_noise = latents.detach().clone()
-            return latents, self.scheduler.timesteps
-
-        latent_shape = latents.shape
-
-        free_init_filter_shape = (1, *latent_shape[1:])
-        free_init_freq_filter = self._get_free_init_freq_filter(
-            shape=free_init_filter_shape,
-            device=device,
-            filter_type=self._free_init_method,
-            order=self._free_init_order,
-            spatial_stop_frequency=self._free_init_spatial_stop_frequency,
-            temporal_stop_frequency=self._free_init_temporal_stop_frequency,
-        )
-
-        current_diffuse_timestep = self.scheduler.config.num_train_timesteps - 1
-        diffuse_timesteps = torch.full((latent_shape[0],), current_diffuse_timestep).long()
-
-        z_t = self.scheduler.add_noise(
-            original_samples=latents, noise=self._free_init_initial_noise, timesteps=diffuse_timesteps.to(device)
-        ).to(dtype=torch.float32)
-
-        z_rand = randn_tensor(
-            shape=latent_shape,
-            generator=generator,
-            device=device,
-            dtype=torch.float32,
-        )
-        latents = self._apply_freq_filter(z_t, z_rand, low_pass_filter=free_init_freq_filter)
-        latents = latents.to(dtype)
-
-        # Coarse-to-Fine Sampling for faster inference (can lead to lower quality)
-        if self._free_init_use_fast_sampling:
-            num_inference_steps = int(num_inference_steps / self._free_init_num_iters * (free_init_iteration + 1))
-            self.scheduler.set_timesteps(num_inference_steps, device=device)
-
-        return latents, self.scheduler.timesteps
@@ -477,9 +477,8 @@ class LatentConsistencyModelImg2ImgPipeline(

            return image_embeds, uncond_image_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
    def prepare_ip_adapter_image_embeds(
-        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
+        self, ip_adapter_image, ip_adapter_image_embeds, do_classifier_free_guidance, device, num_images_per_prompt
    ):
        if ip_adapter_image_embeds is None:
            if not isinstance(ip_adapter_image, list):
@@ -503,7 +502,7 @@ class LatentConsistencyModelImg2ImgPipeline(
                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
                )

-                if self.do_classifier_free_guidance:
+                if do_classifier_free_guidance:
                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
                    single_image_embeds = single_image_embeds.to(device)

@@ -700,10 +699,6 @@ class LatentConsistencyModelImg2ImgPipeline(
    def clip_skip(self):
        return self._clip_skip

-    @property
-    def do_classifier_free_guidance(self):
-        return False
-
    @property
    def num_timesteps(self):
        return self._num_timesteps
@@ -850,7 +845,7 @@ class LatentConsistencyModelImg2ImgPipeline(

        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
            image_embeds = self.prepare_ip_adapter_image_embeds(
-                ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt
+                ip_adapter_image, ip_adapter_image_embeds, False, device, batch_size * num_images_per_prompt
            )

        # 3. Encode input prompt
@@ -865,7 +860,7 @@ class LatentConsistencyModelImg2ImgPipeline(
            prompt,
            device,
            num_images_per_prompt,
-            self.do_classifier_free_guidance,
+            False,
            negative_prompt=None,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=None,
@@ -911,11 +906,7 @@ class LatentConsistencyModelImg2ImgPipeline(
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None)

        # 7.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = (
-            {"image_embeds": image_embeds}
-            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
-            else None
-        )
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

        # 8. LCM Multistep Sampling Loop
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
@@ -461,41 +461,6 @@ class LatentConsistencyModelPipeline(

            return image_embeds, uncond_image_embeds

-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds
-    def prepare_ip_adapter_image_embeds(
-        self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
-    ):
-        if ip_adapter_image_embeds is None:
-            if not isinstance(ip_adapter_image, list):
-                ip_adapter_image = [ip_adapter_image]
-
-            if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers):
-                raise ValueError(
-                    f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
-                )
-
-            image_embeds = []
-            for single_ip_adapter_image, image_proj_layer in zip(
-                ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers
-            ):
-                output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
-                single_image_embeds, single_negative_image_embeds = self.encode_image(
-                    single_ip_adapter_image, device, 1, output_hidden_state
-                )
-                single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-                single_negative_image_embeds = torch.stack(
-                    [single_negative_image_embeds] * num_images_per_prompt, dim=0
-                )
-
-                if self.do_classifier_free_guidance:
-                    single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-                    single_image_embeds = single_image_embeds.to(device)
-
-                image_embeds.append(single_image_embeds)
-        else:
-            image_embeds = ip_adapter_image_embeds
-        return image_embeds
-
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is None:
@@ -625,10 +590,6 @@ class LatentConsistencyModelPipeline(
    def clip_skip(self):
        return self._clip_skip

-    @property
-    def do_classifier_free_guidance(self):
-        return False
-
    @property
    def num_timesteps(self):
        return self._num_timesteps
@@ -649,7 +610,6 @@ class LatentConsistencyModelPipeline(
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        ip_adapter_image: Optional[PipelineImageInput] = None,
-        ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
@@ -700,9 +660,6 @@ class LatentConsistencyModelPipeline(
                provided, text embeddings are generated from the `prompt` input argument.
            ip_adapter_image: (`PipelineImageInput`, *optional*):
                Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. If not
-                provided, embeddings are computed from the `ip_adapter_image` input argument.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
@@ -769,10 +726,12 @@ class LatentConsistencyModelPipeline(
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device
+        # do_classifier_free_guidance = guidance_scale > 1.0

-        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
-            image_embeds = self.prepare_ip_adapter_image_embeds(
-                ip_adapter_image, ip_adapter_image_embeds, device, batch_size * num_images_per_prompt
+        if ip_adapter_image is not None:
+            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
+            image_embeds, negative_image_embeds = self.encode_image(
+                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
            )

        # 3. Encode input prompt
@@ -787,7 +746,7 @@ class LatentConsistencyModelPipeline(
            prompt,
            device,
            num_images_per_prompt,
-            self.do_classifier_free_guidance,
+            False,
            negative_prompt=None,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=None,
@@ -827,11 +786,7 @@ class LatentConsistencyModelPipeline(
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, None)

        # 7.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = (
-            {"image_embeds": image_embeds}
-            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
-            else None
-        )
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

        # 8. LCM MultiStep Sampling Loop:
        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
@@ -45,7 +45,6 @@ from ...utils import (
    unscale_lora_layers,
 )
 from ...utils.torch_utils import randn_tensor
-from ..free_init_utils import FreeInitMixin
 from ..pipeline_utils import DiffusionPipeline


@@ -211,7 +210,7 @@ class PIAPipelineOutput(BaseOutput):


 class PIAPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin, FreeInitMixin
+    DiffusionPipeline, TextualInversionLoaderMixin, IPAdapterMixin, LoraLoaderMixin, FromSingleFileMixin
 ):
    r"""
    Pipeline for text-to-video generation.
@@ -561,6 +560,58 @@ class PIAPipeline(
        """Disables the FreeU mechanism if enabled."""
        self.unet.disable_freeu()

+    @property
+    def free_init_enabled(self):
+        return hasattr(self, "_free_init_num_iters") and self._free_init_num_iters is not None
+
+    def enable_free_init(
+        self,
+        num_iters: int = 3,
+        use_fast_sampling: bool = False,
+        method: str = "butterworth",
+        order: int = 4,
+        spatial_stop_frequency: float = 0.25,
+        temporal_stop_frequency: float = 0.25,
+        generator: Optional[torch.Generator] = None,
+    ):
+        """Enables the FreeInit mechanism as in https://arxiv.org/abs/2312.07537.
+
+        This implementation has been adapted from the [official repository](https://github.com/TianxingWu/FreeInit).
+
+        Args:
+            num_iters (`int`, *optional*, defaults to `3`):
+                Number of FreeInit noise re-initialization iterations.
+            use_fast_sampling (`bool`, *optional*, defaults to `False`):
+                Whether or not to speed up sampling procedure at the cost of probably lower quality results. Enables
+                the "Coarse-to-Fine Sampling" strategy, as mentioned in the paper, if set to `True`.
+            method (`str`, *optional*, defaults to `butterworth`):
+                Must be one of `butterworth`, `ideal` or `gaussian` to use as the filtering method for the
+                FreeInit low pass filter.
+            order (`int`, *optional*, defaults to `4`):
+                Order of the filter used in `butterworth` method. Larger values lead to `ideal` method behaviour
+                whereas lower values lead to `gaussian` method behaviour.
+            spatial_stop_frequency (`float`, *optional*, defaults to `0.25`):
+                Normalized stop frequency for spatial dimensions. Must be between 0 and 1. Referred to as `d_s` in
+                the original implementation.
+            temporal_stop_frequency (`float`, *optional*, defaults to `0.25`):
+                Normalized stop frequency for temporal dimensions. Must be between 0 and 1. Referred to as `d_t` in
+                the original implementation.
+            generator (`torch.Generator`, *optional*, defaults to `None`):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                FreeInit generation deterministic.
+        """
+        self._free_init_num_iters = num_iters
+        self._free_init_use_fast_sampling = use_fast_sampling
+        self._free_init_method = method
+        self._free_init_order = order
+        self._free_init_spatial_stop_frequency = spatial_stop_frequency
+        self._free_init_temporal_stop_frequency = temporal_stop_frequency
+        self._free_init_generator = generator
+
+    def disable_free_init(self):
+        """Disables the FreeInit mechanism if enabled."""
+        self._free_init_num_iters = None
+
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
    def prepare_extra_step_kwargs(self, generator, eta):
        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
@@ -744,6 +795,143 @@ class PIAPipeline(

        return mask, masked_image

+    def _denoise_loop(
+        self,
+        timesteps,
+        num_inference_steps,
+        do_classifier_free_guidance,
+        guidance_scale,
+        num_warmup_steps,
+        prompt_embeds,
+        negative_prompt_embeds,
+        latents,
+        mask,
+        masked_image,
+        cross_attention_kwargs,
+        added_cond_kwargs,
+        extra_step_kwargs,
+        callback_on_step_end,
+        callback_on_step_end_tensor_inputs,
+    ):
+        """Denoising loop for PIA."""
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                latent_model_input = torch.cat([latent_model_input, mask, masked_image], dim=1)
+
+                # predict the noise residual
+                noise_pred = self.unet(
+                    latent_model_input,
+                    t,
+                    encoder_hidden_states=prompt_embeds,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    added_cond_kwargs=added_cond_kwargs,
+                ).sample
+
+                # perform guidance
+                if do_classifier_free_guidance:
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+
+                if callback_on_step_end is not None:
+                    callback_kwargs = {}
+                    for k in callback_on_step_end_tensor_inputs:
+                        callback_kwargs[k] = locals()[k]
+                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
+
+                    latents = callback_outputs.pop("latents", latents)
+                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
+                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+
+                # advance the progress bar on the last step, or after warmup on every scheduler-order boundary
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+
+        return latents
+
+    def _free_init_loop(
+        self,
+        height,
+        width,
+        num_frames,
+        batch_size,
+        num_videos_per_prompt,
+        denoise_args,
+        device,
+    ):
+        """Denoising loop for PIA using FreeInit noise reinitialization technique."""
+
+        latents = denoise_args.get("latents")
+        prompt_embeds = denoise_args.get("prompt_embeds")
+        timesteps = denoise_args.get("timesteps")
+        num_inference_steps = denoise_args.get("num_inference_steps")
+
+        latent_shape = (
+            batch_size * num_videos_per_prompt,
+            4,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        free_init_filter_shape = (
+            1,
+            4,
+            num_frames,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        free_init_freq_filter = _get_freeinit_freq_filter(
+            shape=free_init_filter_shape,
+            device=device,
+            filter_type=self._free_init_method,
+            order=self._free_init_order,
+            spatial_stop_frequency=self._free_init_spatial_stop_frequency,
+            temporal_stop_frequency=self._free_init_temporal_stop_frequency,
+        )
+
+        with self.progress_bar(total=self._free_init_num_iters) as free_init_progress_bar:
+            for i in range(self._free_init_num_iters):
+                # For the first FreeInit iteration, the original latent is used without modification.
+                # Subsequent iterations apply the noise reinitialization technique.
+                if i == 0:
+                    initial_noise = latents.detach().clone()
+                else:
+                    current_diffuse_timestep = (
+                        self.scheduler.config.num_train_timesteps - 1
+                    )  # diffuse to t=999 noise level
+                    diffuse_timesteps = torch.full((batch_size,), current_diffuse_timestep).long()
+                    z_T = self.scheduler.add_noise(
+                        original_samples=latents, noise=initial_noise, timesteps=diffuse_timesteps.to(device)
+                    ).to(dtype=torch.float32)
+                    z_rand = randn_tensor(
+                        shape=latent_shape,
+                        generator=self._free_init_generator,
+                        device=device,
+                        dtype=torch.float32,
+                    )
+                    latents = _freq_mix_3d(z_T, z_rand, LPF=free_init_freq_filter)
+                    latents = latents.to(prompt_embeds.dtype)
+
+                # Coarse-to-Fine Sampling for faster inference (can lead to lower quality)
+                if self._free_init_use_fast_sampling:
+                    current_num_inference_steps = int(num_inference_steps / self._free_init_num_iters * (i + 1))
+                    self.scheduler.set_timesteps(current_num_inference_steps, device=device)
+                    timesteps = self.scheduler.timesteps
+                    denoise_args.update({"timesteps": timesteps, "num_inference_steps": current_num_inference_steps})
+
+                num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+                denoise_args.update({"latents": latents, "num_warmup_steps": num_warmup_steps})
+                latents = self._denoise_loop(**denoise_args)
+
+                free_init_progress_bar.update()
+
+        return latents
+
    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
    def get_timesteps(self, num_inference_steps, strength, device):
        # get the original timestep using init_timestep
@@ -756,6 +944,19 @@ class PIAPipeline(

        return timesteps, num_inference_steps - t_start

+    def _retrieve_video_frames(self, latents, output_type, return_dict):
+        """Helper function to handle latents to output conversion."""
+        if output_type == "latent":
+            return PIAPipelineOutput(frames=latents)
+
+        video_tensor = self.decode_latents(latents)
+        video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+
+        if not return_dict:
+            return (video,)
+
+        return PIAPipelineOutput(frames=video)
+
    @property
    def guidance_scale(self):
        return self._guidance_scale
@@ -987,69 +1188,44 @@ class PIAPipeline(
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7. Add image embeds for IP-Adapter
-        added_cond_kwargs = (
-            {"image_embeds": image_embeds}
-            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
-            else None
-        )
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

        # 8. Denoising loop
-        num_free_init_iters = self._free_init_num_iters if self.free_init_enabled else 1
-        for free_init_iter in range(num_free_init_iters):
-            if self.free_init_enabled:
-                latents, timesteps = self._apply_free_init(
-                    latents, free_init_iter, num_inference_steps, device, latents.dtype, generator
-                )
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        denoise_args = {
+            "timesteps": timesteps,
+            "num_inference_steps": num_inference_steps,
+            "do_classifier_free_guidance": self.do_classifier_free_guidance,
+            "guidance_scale": guidance_scale,
+            "num_warmup_steps": num_warmup_steps,
+            "prompt_embeds": prompt_embeds,
+            "negative_prompt_embeds": negative_prompt_embeds,
+            "latents": latents,
+            "mask": mask,
+            "masked_image": masked_image,
+            "cross_attention_kwargs": self.cross_attention_kwargs,
+            "added_cond_kwargs": added_cond_kwargs,
+            "extra_step_kwargs": extra_step_kwargs,
+            "callback_on_step_end": callback_on_step_end,
+            "callback_on_step_end_tensor_inputs": callback_on_step_end_tensor_inputs,
+        }

-            num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-            with self.progress_bar(total=num_inference_steps) as progress_bar:
-                for i, t in enumerate(timesteps):
-                    # expand the latents if we are doing classifier free guidance
-                    latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
-                    latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-                    latent_model_input = torch.cat([latent_model_input, mask, masked_image], dim=1)
+        if self.free_init_enabled:
+            latents = self._free_init_loop(
+                height=height,
+                width=width,
+                num_frames=num_frames,
+                batch_size=batch_size,
+                num_videos_per_prompt=num_videos_per_prompt,
+                denoise_args=denoise_args,
+                device=device,
+            )
+        else:
+            latents = self._denoise_loop(**denoise_args)

-                    # predict the noise residual
-                    noise_pred = self.unet(
-                        latent_model_input,
-                        t,
-                        encoder_hidden_states=prompt_embeds,
-                        cross_attention_kwargs=cross_attention_kwargs,
-                        added_cond_kwargs=added_cond_kwargs,
-                    ).sample
-
-                    # perform guidance
-                    if self.do_classifier_free_guidance:
-                        noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                        noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                    # compute the previous noisy sample x_t -> x_t-1
-                    latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
-
-                    if callback_on_step_end is not None:
-                        callback_kwargs = {}
-                        for k in callback_on_step_end_tensor_inputs:
-                            callback_kwargs[k] = locals()[k]
-                        callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                        latents = callback_outputs.pop("latents", latents)
-                        prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                        negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
-                    # call the callback, if provided
-                    if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                        progress_bar.update()
-
-        if output_type == "latent":
-            return PIAPipelineOutput(frames=latents)
-
-        video_tensor = self.decode_latents(latents)
-        video = tensor2vid(video_tensor, self.image_processor, output_type=output_type)
+        video = self._retrieve_video_frames(latents, output_type, return_dict)

        # 9. Offload all models
        self.maybe_free_model_hooks()

-        if not return_dict:
-            return (video,)
-
-        return PIAPipelineOutput(frames=video)
+        return video
@@ -1111,11 +1111,7 @@ class StableDiffusionImg2ImgPipeline(
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = (
-            {"image_embeds": image_embeds}
-            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
-            else None
-        )
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

        # 7.2 Optionally get Guidance Scale Embedding
        timestep_cond = None
@@ -1397,11 +1397,7 @@ class StableDiffusionInpaintPipeline(
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 9.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = (
-            {"image_embeds": image_embeds}
-            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
-            else None
-        )
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

        # 9.2 Optionally get Guidance Scale Embedding
        timestep_cond = None
@@ -553,15 +553,13 @@ class StableDiffusionInstructPix2PixPipeline(
            else:
                attention_mask = None

-            prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
+            prompt_embeds = self.text_encoder(
+                text_input_ids.to(device),
+                attention_mask=attention_mask,
+            )
            prompt_embeds = prompt_embeds[0]

-        if self.text_encoder is not None:
-            prompt_embeds_dtype = self.text_encoder.dtype
-        else:
-            prompt_embeds_dtype = self.unet.dtype
-
-        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+        prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)

        bs_embed, seq_len, _ = prompt_embeds.shape
        # duplicate text embeddings for each generation per prompt, using mps friendly method
@@ -617,7 +615,7 @@ class StableDiffusionInstructPix2PixPipeline(
            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
            seq_len = negative_prompt_embeds.shape[1]

-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
+            negative_prompt_embeds = negative_prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)

            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
@@ -777,11 +777,7 @@ class StableDiffusionPanoramaPipeline(DiffusionPipeline, TextualInversionLoaderM
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 7.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = (
-            {"image_embeds": image_embeds}
-            if ip_adapter_image is not None or ip_adapter_image_embeds is not None
-            else None
-        )
+        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None

        # 8. Denoising loop
        # Each denoising step also includes refinement of the latents with respect to the
@@ -132,15 +132,15 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):
            image = _resize_with_antialiasing(image, (224, 224))
            image = (image + 1.0) / 2.0

-        # Normalize the image with for CLIP input
-        image = self.feature_extractor(
-            images=image,
-            do_normalize=True,
-            do_center_crop=False,
-            do_resize=False,
-            do_rescale=False,
-            return_tensors="pt",
-        ).pixel_values
+            # Normalize the image for CLIP input
+            image = self.feature_extractor(
+                images=image,
+                do_normalize=True,
+                do_center_crop=False,
+                do_resize=False,
+                do_rescale=False,
+                return_tensors="pt",
+            ).pixel_values

        image = image.to(device=device, dtype=dtype)
        image_embeddings = self.image_encoder(image).image_embeds
@@ -333,7 +333,8 @@ class StableVideoDiffusionPipeline(DiffusionPipeline):

        Args:
            image (`PIL.Image.Image` or `List[PIL.Image.Image]` or `torch.FloatTensor`):
-                Image or images to guide image generation. If you provide a tensor, the expected value range is between `[0,1]`.
+                Image or images to guide image generation. If you provide a tensor, it needs to be compatible with
+                [`CLIPImageProcessor`](https://huggingface.co/lambdalabs/sd-image-variations-diffusers/blob/main/feature_extractor/preprocessor_config.json).
            height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
                The height in pixels of the generated image.
            width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`):
@@ -21,7 +21,7 @@ from typing import Any, Tuple

 import numpy as np

-from .import_utils import is_torch_available, is_torch_version
+from .import_utils import is_torch_available


 def is_tensor(x) -> bool:
@@ -60,18 +60,11 @@ class BaseOutput(OrderedDict):
        if is_torch_available():
            import torch.utils._pytree

-            if is_torch_version("<", "2.2"):
-                torch.utils._pytree._register_pytree_node(
-                    cls,
-                    torch.utils._pytree._dict_flatten,
-                    lambda values, context: cls(**torch.utils._pytree._dict_unflatten(values, context)),
-                )
-            else:
-                torch.utils._pytree.register_pytree_node(
-                    cls,
-                    torch.utils._pytree._dict_flatten,
-                    lambda values, context: cls(**torch.utils._pytree._dict_unflatten(values, context)),
-                )
+            torch.utils._pytree._register_pytree_node(
+                cls,
+                torch.utils._pytree._dict_flatten,
+                lambda values, context: cls(**torch.utils._pytree._dict_unflatten(values, context)),
+            )

    def __post_init__(self) -> None:
        class_fields = fields(self)
@@ -62,10 +62,7 @@ def create_ip_adapter_state_dict(model):
    key_id = 1

    for name in model.attn_processors.keys():
-        cross_attention_dim = (
-            None if name.endswith("attn1.processor") or "motion_module" in name else model.config.cross_attention_dim
-        )
-
+        cross_attention_dim = None if name.endswith("attn1.processor") else model.config.cross_attention_dim
        if name.startswith("mid_block"):
            hidden_size = model.config.block_out_channels[-1]
        elif name.startswith("up_blocks"):
@@ -74,7 +71,6 @@ def create_ip_adapter_state_dict(model):
        elif name.startswith("down_blocks"):
            block_id = int(name[len("down_blocks.")])
            hidden_size = model.config.block_out_channels[block_id]
-
        if cross_attention_dim is not None:
            sd = IPAdapterAttnProcessor(
                hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0
@@ -18,7 +18,7 @@ from diffusers.utils import is_xformers_available, logging
 from diffusers.utils.testing_utils import numpy_cosine_similarity_distance, require_torch_gpu, slow, torch_device

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import IPAdapterTesterMixin, PipelineTesterMixin
+from ..test_pipelines_common import PipelineTesterMixin


 def to_np(tensor):
@@ -28,7 +28,7 @@ def to_np(tensor):
    return tensor


-class AnimateDiffPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class AnimateDiffPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = AnimateDiffPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
@@ -242,6 +242,7 @@ class AnimateDiffPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, un
        inputs_normal = self.get_dummy_inputs(torch_device)
        frames_normal = pipe(**inputs_normal).frames[0]

+        free_init_generator = torch.Generator(device=torch_device).manual_seed(0)
        pipe.enable_free_init(
            num_iters=2,
            use_fast_sampling=True,
@@ -249,6 +250,7 @@ class AnimateDiffPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, un
            order=4,
            spatial_stop_frequency=0.25,
            temporal_stop_frequency=0.25,
+            generator=free_init_generator,
        )
        inputs_enable_free_init = self.get_dummy_inputs(torch_device)
        frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]
@@ -18,7 +18,7 @@ from diffusers.utils import is_xformers_available, logging
 from diffusers.utils.testing_utils import torch_device

 from ..pipeline_params import TEXT_TO_IMAGE_PARAMS, VIDEO_TO_VIDEO_BATCH_PARAMS
-from ..test_pipelines_common import IPAdapterTesterMixin, PipelineTesterMixin
+from ..test_pipelines_common import PipelineTesterMixin


 def to_np(tensor):
@@ -28,7 +28,7 @@ def to_np(tensor):
    return tensor


-class AnimateDiffVideoToVideoPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class AnimateDiffVideoToVideoPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = AnimateDiffVideoToVideoPipeline
    params = TEXT_TO_IMAGE_PARAMS
    batch_params = VIDEO_TO_VIDEO_BATCH_PARAMS
@@ -267,38 +267,3 @@ class AnimateDiffVideoToVideoPipelineFastTests(IPAdapterTesterMixin, PipelineTes

        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
        self.assertLess(max_diff, 1e-4, "XFormers attention should not affect the inference results")
-
-    def test_free_init(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.set_progress_bar_config(disable=None)
-        pipe.to(torch_device)
-
-        inputs_normal = self.get_dummy_inputs(torch_device)
-        frames_normal = pipe(**inputs_normal).frames[0]
-
-        pipe.enable_free_init(
-            num_iters=2,
-            use_fast_sampling=True,
-            method="butterworth",
-            order=4,
-            spatial_stop_frequency=0.25,
-            temporal_stop_frequency=0.25,
-        )
-        inputs_enable_free_init = self.get_dummy_inputs(torch_device)
-        frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]
-
-        pipe.disable_free_init()
-        inputs_disable_free_init = self.get_dummy_inputs(torch_device)
-        frames_disable_free_init = pipe(**inputs_disable_free_init).frames[0]
-
-        sum_enabled = np.abs(to_np(frames_normal) - to_np(frames_enable_free_init)).sum()
-        max_diff_disabled = np.abs(to_np(frames_normal) - to_np(frames_disable_free_init)).max()
-        self.assertGreater(
-            sum_enabled, 1e1, "Enabling of FreeInit should lead to results different from the default pipeline results"
-        )
-        self.assertLess(
-            max_diff_disabled,
-            1e-4,
-            "Disabling of FreeInit should lead to results similar to the default pipeline results",
-        )
@@ -54,7 +54,6 @@ from ..pipeline_params import (
    TEXT_TO_IMAGE_PARAMS,
 )
 from ..test_pipelines_common import (
-    IPAdapterTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
@@ -111,11 +110,7 @@ def _test_stable_diffusion_compile(in_queue, out_queue, timeout):


 class ControlNetPipelineFastTests(
-    IPAdapterTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineKarrasSchedulerTesterMixin,
-    PipelineTesterMixin,
-    unittest.TestCase,
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
 ):
    pipeline_class = StableDiffusionControlNetPipeline
    params = TEXT_TO_IMAGE_PARAMS
@@ -278,7 +273,7 @@ class ControlNetPipelineFastTests(


 class StableDiffusionMultiControlNetPipelineFastTests(
-    IPAdapterTesterMixin, PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
+    PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
 ):
    pipeline_class = StableDiffusionControlNetPipeline
    params = TEXT_TO_IMAGE_PARAMS
@@ -495,7 +490,7 @@ class StableDiffusionMultiControlNetPipelineFastTests(


 class StableDiffusionMultiControlNetOneModelPipelineFastTests(
-    IPAdapterTesterMixin, PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
+    PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
 ):
    pipeline_class = StableDiffusionControlNetPipeline
    params = TEXT_TO_IMAGE_PARAMS
@@ -52,7 +52,6 @@ from ..pipeline_params import (
    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
 )
 from ..test_pipelines_common import (
-    IPAdapterTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
@@ -63,11 +62,7 @@ enable_full_determinism()


 class ControlNetImg2ImgPipelineFastTests(
-    IPAdapterTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineKarrasSchedulerTesterMixin,
-    PipelineTesterMixin,
-    unittest.TestCase,
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
 ):
    pipeline_class = StableDiffusionControlNetImg2ImgPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
@@ -186,7 +181,7 @@ class ControlNetImg2ImgPipelineFastTests(


 class StableDiffusionMultiControlNetPipelineFastTests(
-    IPAdapterTesterMixin, PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
+    PipelineTesterMixin, PipelineKarrasSchedulerTesterMixin, unittest.TestCase
 ):
    pipeline_class = StableDiffusionControlNetImg2ImgPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
@@ -51,7 +51,11 @@ from ..pipeline_params import (
    TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
    TEXT_TO_IMAGE_IMAGE_PARAMS,
 )
-from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+from ..test_pipelines_common import (
+    PipelineKarrasSchedulerTesterMixin,
+    PipelineLatentTesterMixin,
+    PipelineTesterMixin,
+)


 enable_full_determinism()
@@ -555,16 +559,17 @@ class ControlNetInpaintPipelineSlowTests(unittest.TestCase):
    def test_load_local(self):
        controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_canny")
        pipe_1 = StableDiffusionControlNetInpaintPipeline.from_pretrained(
-            "runwayml/stable-diffusion-inpainting", safety_checker=None, controlnet=controlnet
+            "runwayml/stable-diffusion-v1-5", safety_checker=None, controlnet=controlnet
        )

        controlnet = ControlNetModel.from_single_file(
            "https://huggingface.co/lllyasviel/ControlNet-v1-1/blob/main/control_v11p_sd15_canny.pth"
        )
        pipe_2 = StableDiffusionControlNetInpaintPipeline.from_single_file(
-            "https://huggingface.co/runwayml/stable-diffusion-inpainting/blob/main/sd-v1-5-inpainting.ckpt",
+            "https://huggingface.co/runwayml/stable-diffusion-v1-5/blob/main/v1-5-pruned-emaonly.safetensors",
            safety_checker=None,
            controlnet=controlnet,
+            scheduler_type="pndm",
        )
        control_image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
@@ -48,7 +48,6 @@ from ..pipeline_params import (
    TEXT_TO_IMAGE_PARAMS,
 )
 from ..test_pipelines_common import (
-    IPAdapterTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
@@ -60,7 +59,6 @@ enable_full_determinism()


 class StableDiffusionXLControlNetPipelineFastTests(
-    IPAdapterTesterMixin,
    PipelineLatentTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineTesterMixin,
@@ -36,7 +36,6 @@ from ..pipeline_params import (
    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
 )
 from ..test_pipelines_common import (
-    IPAdapterTesterMixin,
    PipelineKarrasSchedulerTesterMixin,
    PipelineLatentTesterMixin,
    PipelineTesterMixin,
@@ -47,11 +46,7 @@ enable_full_determinism()


 class ControlNetPipelineSDXLImg2ImgFastTests(
-    IPAdapterTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineKarrasSchedulerTesterMixin,
-    PipelineTesterMixin,
-    unittest.TestCase,
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
 ):
    pipeline_class = StableDiffusionXLControlNetImg2ImgPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
@@ -31,7 +31,6 @@ from diffusers import (
    StableDiffusionXLInpaintPipeline,
    StableDiffusionXLPipeline,
 )
-from diffusers.image_processor import IPAdapterMaskProcessor
 from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
@@ -65,7 +64,7 @@ class IPAdapterNightlyTestsMixin(unittest.TestCase):
        image_processor = CLIPImageProcessor.from_pretrained(repo_id)
        return image_processor

-    def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False, for_sdxl=False, for_masks=False):
+    def get_dummy_inputs(self, for_image_to_image=False, for_inpainting=False, for_sdxl=False):
        image = load_image(
            "https://user-images.githubusercontent.com/24734142/266492875-2d50d223-8475-44f0-a7c6-08b51cb53572.png"
        )
@@ -102,22 +101,6 @@ class IPAdapterNightlyTestsMixin(unittest.TestCase):

            input_kwargs.update({"image": image, "mask_image": mask, "ip_adapter_image": ip_image})

-        elif for_masks:
-            face_image1 = load_image(
-                "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_girl1.png"
-            )
-            face_image2 = load_image(
-                "https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_girl2.png"
-            )
-            mask1 = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_mask1.png")
-            mask2 = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip_mask_mask2.png")
-            input_kwargs.update(
-                {
-                    "ip_adapter_image": [[face_image1], [face_image2]],
-                    "cross_attention_kwargs": {"ip_adapter_masks": [mask1, mask2]},
-                }
-            )
-
        return input_kwargs


@@ -482,58 +465,3 @@ class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin):

        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
        assert max_diff < 5e-4
-
-    def test_ip_adapter_single_mask(self):
-        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
-        pipeline = StableDiffusionXLPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-xl-base-1.0",
-            image_encoder=image_encoder,
-            torch_dtype=self.dtype,
-        )
-        pipeline.to(torch_device)
-        pipeline.load_ip_adapter(
-            "h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter-plus-face_sdxl_vit-h.safetensors"
-        )
-        pipeline.set_ip_adapter_scale(0.7)
-
-        inputs = self.get_dummy_inputs(for_masks=True)
-        mask = inputs["cross_attention_kwargs"]["ip_adapter_masks"][0]
-        processor = IPAdapterMaskProcessor()
-        mask = processor.preprocess(mask)
-        inputs["cross_attention_kwargs"]["ip_adapter_masks"] = mask
-        inputs["ip_adapter_image"] = inputs["ip_adapter_image"][0]
-        images = pipeline(**inputs).images
-        image_slice = images[0, :3, :3, -1].flatten()
-        expected_slice = np.array(
-            [0.7307304, 0.73450166, 0.73731124, 0.7377061, 0.7318013, 0.73720926, 0.74746597, 0.7409929, 0.74074936]
-        )
-
-        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
-        assert max_diff < 5e-4
-
-    def test_ip_adapter_multiple_masks(self):
-        image_encoder = self.get_image_encoder(repo_id="h94/IP-Adapter", subfolder="models/image_encoder")
-        pipeline = StableDiffusionXLPipeline.from_pretrained(
-            "stabilityai/stable-diffusion-xl-base-1.0",
-            image_encoder=image_encoder,
-            torch_dtype=self.dtype,
-        )
-        pipeline.to(torch_device)
-        pipeline.load_ip_adapter(
-            "h94/IP-Adapter", subfolder="sdxl_models", weight_name=["ip-adapter-plus-face_sdxl_vit-h.safetensors"] * 2
-        )
-        pipeline.set_ip_adapter_scale([0.7] * 2)
-
-        inputs = self.get_dummy_inputs(for_masks=True)
-        masks = inputs["cross_attention_kwargs"]["ip_adapter_masks"]
-        processor = IPAdapterMaskProcessor()
-        masks = processor.preprocess(masks)
-        inputs["cross_attention_kwargs"]["ip_adapter_masks"] = masks
-        images = pipeline(**inputs).images
-        image_slice = images[0, :3, :3, -1].flatten()
-        expected_slice = np.array(
-            [0.79474676, 0.7977683, 0.8013954, 0.7988008, 0.7970615, 0.8029355, 0.80614823, 0.8050743, 0.80627424]
-        )
-
-        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
-        assert max_diff < 5e-4
@@ -20,15 +20,13 @@ from diffusers.utils.testing_utils import (
 )

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin


 enable_full_determinism()


-class LatentConsistencyModelPipelineFastTests(
-    IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
-):
+class LatentConsistencyModelPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
    pipeline_class = LatentConsistencyModelPipeline
    params = TEXT_TO_IMAGE_PARAMS - {"negative_prompt", "negative_prompt_embeds"}
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS - {"negative_prompt"}
@@ -27,14 +27,14 @@ from ..pipeline_params import (
    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
 )
-from ..test_pipelines_common import IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin


 enable_full_determinism()


 class LatentConsistencyModelImg2ImgPipelineFastTests(
-    IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
+    PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
 ):
    pipeline_class = LatentConsistencyModelImg2ImgPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width", "negative_prompt", "negative_prompt_embeds"}
@@ -17,7 +17,7 @@ from diffusers import (
 from diffusers.utils import is_xformers_available, logging
 from diffusers.utils.testing_utils import floats_tensor, torch_device

-from ..test_pipelines_common import IPAdapterTesterMixin, PipelineTesterMixin
+from ..test_pipelines_common import PipelineTesterMixin


 def to_np(tensor):
@@ -27,7 +27,7 @@ def to_np(tensor):
    return tensor


-class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, unittest.TestCase):
+class PIAPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
    pipeline_class = PIAPipeline
    params = frozenset(
        [
@@ -255,6 +255,7 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, unittest.T
        inputs_normal = self.get_dummy_inputs(torch_device)
        frames_normal = pipe(**inputs_normal).frames[0]

+        free_init_generator = torch.Generator(device=torch_device).manual_seed(0)
        pipe.enable_free_init(
            num_iters=2,
            use_fast_sampling=True,
@@ -262,6 +263,7 @@ class PIAPipelineFastTests(IPAdapterTesterMixin, PipelineTesterMixin, unittest.T
            order=4,
            spatial_stop_frequency=0.25,
            temporal_stop_frequency=0.25,
+            generator=free_init_generator,
        )
        inputs_enable_free_init = self.get_dummy_inputs(torch_device)
        frames_enable_free_init = pipe(**inputs_enable_free_init).frames[0]
@@ -27,13 +27,7 @@ from diffusers import (
    PixArtAlphaPipeline,
    Transformer2DModel,
 )
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    numpy_cosine_similarity_distance,
-    require_torch_gpu,
-    slow,
-    torch_device,
-)
+from diffusers.utils.testing_utils import enable_full_determinism, require_torch_gpu, slow, torch_device

 from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
 from ..test_pipelines_common import PipelineTesterMixin, to_np
@@ -338,35 +332,37 @@ class PixArtAlphaPipelineIntegrationTests(unittest.TestCase):
        torch.cuda.empty_cache()

    def test_pixart_1024(self):
-        generator = torch.Generator("cpu").manual_seed(0)
+        generator = torch.manual_seed(0)

        pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_1024, torch_dtype=torch.float16)
        pipe.enable_model_cpu_offload()
        prompt = self.prompt

-        image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images
+        image = pipe(prompt, generator=generator, output_type="np").images

        image_slice = image[0, -3:, -3:, -1]
-        expected_slice = np.array([0.0742, 0.0835, 0.2114, 0.0295, 0.0784, 0.2361, 0.1738, 0.2251, 0.3589])

-        max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice)
-        self.assertLessEqual(max_diff, 1e-4)
+        expected_slice = np.array([0.1941, 0.2117, 0.2188, 0.1946, 0.218, 0.2124, 0.199, 0.2437, 0.2583])
+
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)

    def test_pixart_512(self):
-        generator = torch.Generator("cpu").manual_seed(0)
+        generator = torch.manual_seed(0)

        pipe = PixArtAlphaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16)
        pipe.enable_model_cpu_offload()

        prompt = self.prompt

-        image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images
+        image = pipe(prompt, generator=generator, output_type="np").images

        image_slice = image[0, -3:, -3:, -1]
-        expected_slice = np.array([0.3477, 0.3882, 0.4541, 0.3413, 0.3821, 0.4463, 0.4001, 0.4409, 0.4958])

-        max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice)
-        self.assertLessEqual(max_diff, 1e-4)
+        expected_slice = np.array([0.2637, 0.291, 0.2939, 0.207, 0.2512, 0.2783, 0.2168, 0.2324, 0.2817])
+
+        max_diff = np.abs(image_slice.flatten() - expected_slice).max()
+        self.assertLessEqual(max_diff, 1e-3)

    def test_pixart_1024_without_resolution_binning(self):
        generator = torch.manual_seed(0)
@@ -376,7 +372,7 @@ class PixArtAlphaPipelineIntegrationTests(unittest.TestCase):

        prompt = self.prompt
        height, width = 1024, 768
-        num_inference_steps = 2
+        num_inference_steps = 10

        image = pipe(
            prompt,
@@ -410,7 +406,7 @@ class PixArtAlphaPipelineIntegrationTests(unittest.TestCase):

        prompt = self.prompt
        height, width = 512, 768
-        num_inference_steps = 2
+        num_inference_steps = 10

        image = pipe(
            prompt,
@@ -23,11 +23,7 @@ import unittest
 import numpy as np
 import torch
 from huggingface_hub import hf_hub_download
-from transformers import (
-    CLIPTextConfig,
-    CLIPTextModel,
-    CLIPTokenizer,
-)
+from transformers import CLIPTextConfig, CLIPTextModel, CLIPTokenizer

 from diffusers import (
    AutoencoderKL,
@@ -64,12 +60,7 @@ from ..pipeline_params import (
    TEXT_TO_IMAGE_IMAGE_PARAMS,
    TEXT_TO_IMAGE_PARAMS,
 )
-from ..test_pipelines_common import (
-    IPAdapterTesterMixin,
-    PipelineKarrasSchedulerTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineTesterMixin,
-)
+from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin


 enable_full_determinism()
@@ -109,11 +100,7 @@ def _test_stable_diffusion_compile(in_queue, out_queue, timeout):


 class StableDiffusionPipelineFastTests(
-    IPAdapterTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineKarrasSchedulerTesterMixin,
-    PipelineTesterMixin,
-    unittest.TestCase,
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
 ):
    pipeline_class = StableDiffusionPipeline
    params = TEXT_TO_IMAGE_PARAMS
@@ -190,7 +177,7 @@ class StableDiffusionPipelineFastTests(
            "generator": generator,
            "num_inference_steps": 2,
            "guidance_scale": 6.0,
-            "output_type": "np",
+            "output_type": "numpy",
        }
        return inputs

@@ -55,12 +55,7 @@ from ..pipeline_params import (
    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
 )
-from ..test_pipelines_common import (
-    IPAdapterTesterMixin,
-    PipelineKarrasSchedulerTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineTesterMixin,
-)
+from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin


 enable_full_determinism()
@@ -99,11 +94,7 @@ def _test_img2img_compile(in_queue, out_queue, timeout):


 class StableDiffusionImg2ImgPipelineFastTests(
-    IPAdapterTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineKarrasSchedulerTesterMixin,
-    PipelineTesterMixin,
-    unittest.TestCase,
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
 ):
    pipeline_class = StableDiffusionImg2ImgPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
@@ -57,12 +57,7 @@ from ..pipeline_params import (
    TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
 )
-from ..test_pipelines_common import (
-    IPAdapterTesterMixin,
-    PipelineKarrasSchedulerTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineTesterMixin,
-)
+from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin


 enable_full_determinism()
@@ -103,11 +98,7 @@ def _test_inpaint_compile(in_queue, out_queue, timeout):


 class StableDiffusionInpaintPipelineFastTests(
-    IPAdapterTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineKarrasSchedulerTesterMixin,
-    PipelineTesterMixin,
-    unittest.TestCase,
+    PipelineLatentTesterMixin, PipelineKarrasSchedulerTesterMixin, PipelineTesterMixin, unittest.TestCase
 ):
    pipeline_class = StableDiffusionInpaintPipeline
    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
@@ -47,11 +47,7 @@ from ..pipeline_params import (
    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
 )
-from ..test_pipelines_common import (
-    PipelineKarrasSchedulerTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineTesterMixin,
-)
+from ..test_pipelines_common import PipelineKarrasSchedulerTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin


 enable_full_determinism()
@@ -49,23 +49,14 @@ from ..pipeline_params import (
    TEXT_TO_IMAGE_IMAGE_PARAMS,
    TEXT_TO_IMAGE_PARAMS,
 )
-from ..test_pipelines_common import (
-    IPAdapterTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineTesterMixin,
-    SDXLOptionalComponentsTesterMixin,
-)
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin, SDXLOptionalComponentsTesterMixin


 enable_full_determinism()


 class StableDiffusionXLPipelineFastTests(
-    IPAdapterTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineTesterMixin,
-    SDXLOptionalComponentsTesterMixin,
-    unittest.TestCase,
+    PipelineLatentTesterMixin, PipelineTesterMixin, SDXLOptionalComponentsTesterMixin, unittest.TestCase
 ):
    pipeline_class = StableDiffusionXLPipeline
    params = TEXT_TO_IMAGE_PARAMS
@@ -44,7 +44,6 @@ from diffusers.utils.testing_utils import (

 from ..pipeline_params import TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS, TEXT_GUIDED_IMAGE_VARIATION_PARAMS
 from ..test_pipelines_common import (
-    IPAdapterTesterMixin,
    PipelineTesterMixin,
    SDXLOptionalComponentsTesterMixin,
    assert_mean_pixel_difference,
@@ -55,7 +54,7 @@ enable_full_determinism()


 class StableDiffusionXLAdapterPipelineFastTests(
-    IPAdapterTesterMixin, PipelineTesterMixin, SDXLOptionalComponentsTesterMixin, unittest.TestCase
+    PipelineTesterMixin, SDXLOptionalComponentsTesterMixin, unittest.TestCase
 ):
    pipeline_class = StableDiffusionXLAdapterPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
@@ -54,20 +54,13 @@ from ..pipeline_params import (
    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
 )
-from ..test_pipelines_common import (
-    IPAdapterTesterMixin,
-    PipelineLatentTesterMixin,
-    PipelineTesterMixin,
-    SDXLOptionalComponentsTesterMixin,
-)
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin, SDXLOptionalComponentsTesterMixin


 enable_full_determinism()


-class StableDiffusionXLImg2ImgPipelineFastTests(
-    IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
-):
+class StableDiffusionXLImg2ImgPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionXLImg2ImgPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS - {"height", "width"}
    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
@@ -48,15 +48,13 @@ from ..pipeline_params import (
    TEXT_GUIDED_IMAGE_INPAINTING_PARAMS,
    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
 )
-from ..test_pipelines_common import IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin
+from ..test_pipelines_common import PipelineLatentTesterMixin, PipelineTesterMixin


 enable_full_determinism()


-class StableDiffusionXLInpaintPipelineFastTests(
-    IPAdapterTesterMixin, PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase
-):
+class StableDiffusionXLInpaintPipelineFastTests(PipelineLatentTesterMixin, PipelineTesterMixin, unittest.TestCase):
    pipeline_class = StableDiffusionXLInpaintPipeline
    params = TEXT_GUIDED_IMAGE_INPAINTING_PARAMS
    batch_params = TEXT_GUIDED_IMAGE_INPAINTING_BATCH_PARAMS
@@ -22,6 +22,7 @@ from diffusers.utils import is_accelerate_available, is_accelerate_version, load
 from diffusers.utils.import_utils import is_xformers_available
 from diffusers.utils.testing_utils import (
    CaptureLogger,
+    disable_full_determinism,
    enable_full_determinism,
    floats_tensor,
    numpy_cosine_similarity_distance,
@@ -33,9 +34,6 @@ from diffusers.utils.testing_utils import (
 from ..test_pipelines_common import PipelineTesterMixin


-enable_full_determinism()
-
-
 def to_np(tensor):
    if isinstance(tensor, torch.Tensor):
        tensor = tensor.detach().cpu().numpy()
@@ -467,6 +465,8 @@ class StableVideoDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCa
        reason="XFormers attention is only available with CUDA and `xformers` installed",
    )
    def test_xformers_attention_forwardGenerator_pass(self):
+        disable_full_determinism()
+
        expected_max_diff = 9e-4

        if not self.test_xformers_attention:
@@ -496,6 +496,8 @@ class StableVideoDiffusionPipelineFastTests(PipelineTesterMixin, unittest.TestCa
        max_diff = np.abs(to_np(output_with_offload) - to_np(output_without_offload)).max()
        self.assertLess(max_diff, expected_max_diff, "XFormers attention should not affect the inference results")

+        enable_full_determinism()
+

@slow
@require_torch_gpu
@@ -8,7 +8,7 @@ import re
 import tempfile
 import unittest
 import uuid
-from typing import Any, Callable, Dict, Union
+from typing import Callable, Union

 import numpy as np
 import PIL.Image
@@ -29,7 +29,6 @@ from diffusers import (
    UNet2DConditionModel,
 )
 from diffusers.image_processor import VaeImageProcessor
-from diffusers.loaders import IPAdapterMixin
 from diffusers.schedulers import KarrasDiffusionSchedulers
 from diffusers.utils import logging
 from diffusers.utils.import_utils import is_accelerate_available, is_accelerate_version, is_xformers_available
@@ -45,7 +44,6 @@ from ..models.autoencoders.test_models_vae import (
    get_autoencoder_tiny_config,
    get_consistency_vae_config,
 )
-from ..models.unets.test_models_unet_2d_condition import create_ip_adapter_state_dict
 from ..others.test_utils import TOKEN, USER, is_staging_test


@@ -61,118 +59,6 @@ def check_same_shape(tensor_list):
    return all(shape == shapes[0] for shape in shapes[1:])


-class IPAdapterTesterMixin:
-    """
-    This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes.
-    It provides a set of common tests for pipelines that support IP Adapters.
-    """
-
-    def test_pipeline_signature(self):
-        parameters = inspect.signature(self.pipeline_class.__call__).parameters
-
-        assert issubclass(self.pipeline_class, IPAdapterMixin)
-        self.assertIn(
-            "ip_adapter_image",
-            parameters,
-            "`ip_adapter_image` argument must be supported by the `__call__` method",
-        )
-        self.assertIn(
-            "ip_adapter_image_embeds",
-            parameters,
-            "`ip_adapter_image_embeds` argument must be supported by the `__call__` method",
-        )
-
-    def _get_dummy_image_embeds(self, cross_attention_dim: int = 32):
-        return torch.randn((2, 1, cross_attention_dim), device=torch_device)
-
-    def _modify_inputs_for_ip_adapter_test(self, inputs: Dict[str, Any]):
-        parameters = inspect.signature(self.pipeline_class.__call__).parameters
-        if "image" in parameters.keys() and "strength" in parameters.keys():
-            inputs["num_inference_steps"] = 4
-
-        inputs["output_type"] = "np"
-        inputs["return_dict"] = False
-        return inputs
-
-    def test_ip_adapter_single(self, expected_max_diff: float = 1e-4):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components).to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        cross_attention_dim = pipe.unet.config.get("cross_attention_dim", 32)
-
-        # forward pass without ip adapter
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
-        output_without_adapter = pipe(**inputs)[0]
-
-        adapter_state_dict = create_ip_adapter_state_dict(pipe.unet)
-        pipe.unet._load_ip_adapter_weights(adapter_state_dict)
-
-        # forward pass with single ip adapter, but scale=0 which should have no effect
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
-        inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
-        pipe.set_ip_adapter_scale(0.0)
-        output_without_adapter_scale = pipe(**inputs)[0]
-
-        # forward pass with single ip adapter, but with scale of adapter weights
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
-        inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)]
-        pipe.set_ip_adapter_scale(42.0)
-        output_with_adapter_scale = pipe(**inputs)[0]
-
-        max_diff_without_adapter_scale = np.abs(output_without_adapter_scale - output_without_adapter).max()
-        max_diff_with_adapter_scale = np.abs(output_with_adapter_scale - output_without_adapter).max()
-
-        self.assertLess(
-            max_diff_without_adapter_scale,
-            expected_max_diff,
-            "Output without ip-adapter must be same as normal inference",
-        )
-        self.assertGreater(
-            max_diff_with_adapter_scale, 1e-2, "Output with ip-adapter must be different from normal inference"
-        )
-
-    def test_ip_adapter_multi(self, expected_max_diff: float = 1e-4):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components).to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        cross_attention_dim = pipe.unet.config.get("cross_attention_dim", 32)
-
-        # forward pass without ip adapter
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
-        output_without_adapter = pipe(**inputs)[0]
-
-        adapter_state_dict_1 = create_ip_adapter_state_dict(pipe.unet)
-        adapter_state_dict_2 = create_ip_adapter_state_dict(pipe.unet)
-        pipe.unet._load_ip_adapter_weights([adapter_state_dict_1, adapter_state_dict_2])
-
-        # forward pass with multi ip adapter, but scale=0 which should have no effect
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
-        inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] * 2
-        pipe.set_ip_adapter_scale([0.0, 0.0])
-        output_without_multi_adapter_scale = pipe(**inputs)[0]
-
-        # forward pass with multi ip adapter, but with scale of adapter weights
-        inputs = self._modify_inputs_for_ip_adapter_test(self.get_dummy_inputs(torch_device))
-        inputs["ip_adapter_image_embeds"] = [self._get_dummy_image_embeds(cross_attention_dim)] * 2
-        pipe.set_ip_adapter_scale([42.0, 42.0])
-        output_with_multi_adapter_scale = pipe(**inputs)[0]
-
-        max_diff_without_multi_adapter_scale = np.abs(
-            output_without_multi_adapter_scale - output_without_adapter
-        ).max()
-        max_diff_with_multi_adapter_scale = np.abs(output_with_multi_adapter_scale - output_without_adapter).max()
-        self.assertLess(
-            max_diff_without_multi_adapter_scale,
-            expected_max_diff,
-            "Output without multi-ip-adapter must be same as normal inference",
-        )
-        self.assertGreater(
-            max_diff_with_multi_adapter_scale,
-            1e-2,
-            "Output with multi-ip-adapter scale must be different from normal inference",
-        )
-
-
 class PipelineLatentTesterMixin:
    """
    This mixin is designed to be used with PipelineTesterMixin and unittest.TestCase classes.