Merge branch 'main' into test-v

Merge branch 'test-v' of https://github.com/huggingface/diffusers into test-v
remove post quant conv
2023-11-27 13:36:38 +00:00 · 2023-11-27 13:35:36 +00:00 · 2023-11-27 13:27:46 +00:00 · 2023-11-27 14:11:26 +01:00 · 2023-11-27 13:59:30 +01:00 · 2023-11-27 13:55:02 +01:00
151 changed files with 5346 additions and 11912 deletions
@@ -35,15 +35,14 @@ jobs:
    - name: Checkout diffusers
      uses: actions/checkout@v3
      with:
-        fetch-depth: 0
+        fetch-depth: 2
    - name: Install dependencies
      run: |
        apt-get update && apt-get install libsndfile1-dev libgl1 -y
-        python -m pip install -e .[quality,test]
+        python -m pip install -e .
    - name: Environment
      run: |
        python utils/print_env.py
-        echo $(git --version)
    - name: Fetch Tests
      run: |
        python utils/tests_fetcher.py | tee test_preparation.txt
@@ -111,7 +110,7 @@ jobs:
      continue-on-error: true
      run: |
        cat reports/${{ matrix.modules }}_tests_cpu_stats.txt
-        cat reports/${{ matrix.modules }}_tests_cpu_failures_short.txt
+        cat reports/${{ matrix.modules }}_tests_cpu/failures_short.txt

    - name: Test suite reports artifacts
      if: ${{ always() }}
@@ -115,7 +115,7 @@ jobs:
      run: |
        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
          --make-reports=tests_${{ matrix.config.report }} \
-          examples
+          examples/test_examples.py

    - name: Failure short reports
      if: ${{ failure() }}
@@ -100,7 +100,7 @@ jobs:
      run: |
        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
          --make-reports=tests_${{ matrix.config.report }} \
-          examples
+          examples/test_examples.py 

    - name: Failure short reports
      if: ${{ failure() }}
@@ -355,7 +355,7 @@ You will need basic `git` proficiency to be able to contribute to
 manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
 Git](https://git-scm.com/book/en/v2) is a very good reference.

-Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/main/setup.py#L265)):
+Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/main/setup.py#L244)):

 1. Fork the [repository](https://github.com/huggingface/diffusers) by
 clicking on the 'Fork' button on the repository's page. This creates a copy of the code
@@ -41,7 +41,7 @@ repo-consistency:

 quality:
 	ruff check $(check_dirs) setup.py
-	ruff format --check $(check_dirs) setup.py
+	ruff format --check $(check_dirs) setup.py 
 	python utils/check_doc_toc.py

 # Format source code automatically and check if there are any problems left that need manual fixing
@@ -82,7 +82,7 @@ Models are designed as configurable toolboxes that are natural extensions of [Py
 The following design principles are followed:
 - Models correspond to **a type of model architecture**. *E.g.* the [`UNet2DConditionModel`] class is used for all UNet variations that expect 2D image inputs and are conditioned on some context.
 - All models can be found in [`src/diffusers/models`](https://github.com/huggingface/diffusers/tree/main/src/diffusers/models) and every model architecture shall be defined in its file, e.g. [`unet_2d_condition.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_condition.py), [`transformer_2d.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/transformer_2d.py), etc...
- Models **do not** follow the single-file policy and should make use of smaller model building blocks, such as [`attention.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py), [`resnet.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py), [`embeddings.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py), etc... **Note**: This is in stark contrast to Transformers' modeling files and shows that models do not really follow the single-file policy.
+- Models **do not** follow the single-file policy and should make use of smaller model building blocks, such as [`attention.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py), [`resnet.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py), [`embeddings.py`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/embeddings.py), etc... **Note**: This is in stark contrast to Transformers' modelling files and shows that models do not really follow the single-file policy.
 - Models intend to expose complexity, just like PyTorch's `Module` class, and give clear error messages.
 - Models all inherit from `ModelMixin` and `ConfigMixin`.
 - Models can be optimized for performance when it doesn’t demand major code changes, keep backward compatibility, and give significant memory or compute gain.
@@ -72,8 +72,6 @@
      title: Overview
    - local: using-diffusers/sdxl
      title: Stable Diffusion XL
-    - local: using-diffusers/sdxl_turbo
-      title: SDXL Turbo
    - local: using-diffusers/kandinsky
      title: Kandinsky
    - local: using-diffusers/controlnet
@@ -96,8 +94,6 @@
      title: Latent Consistency Model-LoRA
    - local: using-diffusers/inference_with_lcm
      title: Latent Consistency Model
-    - local: using-diffusers/svd
-      title: Stable Video Diffusion
    title: Specific pipeline examples
  - sections:
    - local: training/overview
@@ -133,8 +129,6 @@
        title: LoRA
      - local: training/custom_diffusion
        title: Custom Diffusion
-      - local: training/lcm_distill
-        title: Latent Consistency Distillation
      - local: training/ddpo
        title: Reinforcement learning training with DDPO
      title: Methods
@@ -335,14 +329,12 @@
        title: Stable Diffusion 2
      - local: api/pipelines/stable_diffusion/stable_diffusion_xl
        title: Stable Diffusion XL
-      - local: api/pipelines/stable_diffusion/sdxl_turbo
-        title: SDXL Turbo
      - local: api/pipelines/stable_diffusion/latent_upscale
        title: Latent upscaler
      - local: api/pipelines/stable_diffusion/upscale
        title: Super-resolution
      - local: api/pipelines/stable_diffusion/ldm3d_diffusion
-        title: LDM3D Text-to-(RGB, Depth), Text-to-(RGB-pano, Depth-pano), LDM3D Upscaler
+        title: LDM3D Text-to-(RGB, Depth)
      - local: api/pipelines/stable_diffusion/adapter
        title: Stable Diffusion T2I-Adapter
      - local: api/pipelines/stable_diffusion/gligen
@@ -9,32 +9,7 @@ specific language governing permissions and limitations under the License.

 # Kandinsky 3

-Kandinsky 3 was created by [Vladimir Arkhipkin](https://github.com/oriBetelgeuse), [Anastasia Maltseva](https://github.com/NastyaMittseva), [Igor Pavlov](https://github.com/boomb0om), [Andrei Filatov](https://github.com/anvilarth), [Arseniy Shakhmatov](https://github.com/cene555), [Andrey Kuznetsov](https://github.com/kuznetsoffandrey), [Denis Dimitrov](https://github.com/denndimitrov), and [Zein Shaheen](https://github.com/zeinsh).
-
-The description from its GitHub page:
-
-*Kandinsky 3.0 is an open-source text-to-image diffusion model built upon the Kandinsky2-x model family. In comparison to its predecessors, enhancements have been made to the text understanding and visual quality of the model, achieved by increasing the size of the text encoder and Diffusion U-Net models, respectively.*
-
-Its architecture includes 3 main components:
-1. [FLAN-UL2](https://huggingface.co/google/flan-ul2), which is an encoder decoder model based on the T5 architecture. 
-2. New U-Net architecture featuring BigGAN-deep blocks doubles depth while maintaining the same number of parameters.
-3. Sber-MoVQGAN is a decoder proven to have superior results in image restoration.
-
-
-
-The original codebase can be found at [ai-forever/Kandinsky-3](https://github.com/ai-forever/Kandinsky-3).
-
-<Tip>
-
-Check out the [Kandinsky Community](https://huggingface.co/kandinsky-community) organization on the Hub for the official model checkpoints for tasks like text-to-image, image-to-image, and inpainting.
-
-</Tip>
-
-<Tip>
-
-Make sure to check out the schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
+TODO

 ## Kandinsky3Pipeline

@@ -51,10 +51,9 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [InstructPix2Pix](pix2pix) | image editing |
 | [Kandinsky 2.1](kandinsky) | text2image, image2image, inpainting, interpolation |
 | [Kandinsky 2.2](kandinsky_v22) | text2image, image2image, inpainting |
-| [Kandinsky 3](kandinsky3) | text2image, image2image |
 | [Latent Consistency Models](latent_consistency_models) | text2image |
 | [Latent Diffusion](latent_diffusion) | text2image, super-resolution |
-| [LDM3D](stable_diffusion/ldm3d_diffusion) | text2image, text-to-3D, text-to-pano, upscaling |
+| [LDM3D](stable_diffusion/ldm3d_diffusion) | text2image, text-to-3D |
 | [MultiDiffusion](panorama) | text2image |
 | [MusicLDM](musicldm) | text2audio |
 | [Paint by Example](paint_by_example) | inpainting |
@@ -14,11 +14,6 @@ specific language governing permissions and limitations under the License.

 LDM3D was proposed in [LDM3D: Latent Diffusion Model for 3D](https://huggingface.co/papers/2305.10853) by Gabriela Ben Melech Stan, Diana Wofk, Scottie Fox, Alex Redden, Will Saxton, Jean Yu, Estelle Aflalo, Shao-Yen Tseng, Fabio Nonato, Matthias Muller, and Vasudev Lal. LDM3D generates an image and a depth map from a given text prompt, unlike existing text-to-image diffusion models such as [Stable Diffusion](./overview), which only generate an image. With almost the same number of parameters, LDM3D manages to create a latent space that can compress both the RGB images and the depth maps.

-Two checkpoints are available for use:
- [ldm3d-original](https://huggingface.co/Intel/ldm3d). The original checkpoint used in the [paper](https://arxiv.org/pdf/2305.10853.pdf)
- [ldm3d-4c](https://huggingface.co/Intel/ldm3d-4c). The new version of LDM3D using 4 channels inputs instead of 6-channels inputs and finetuned on higher resolution images. 
-
-
 The abstract from the paper is:

 *This research paper proposes a Latent Diffusion Model for 3D (LDM3D) that generates both image and depth map data from a given text prompt, allowing users to generate RGBD images from text prompts. The LDM3D model is fine-tuned on a dataset of tuples containing an RGB image, depth map and caption, and validated through extensive experiments. We also develop an application called DepthFusion, which uses the generated RGB images and depth maps to create immersive and interactive 360-degree-view experiences using TouchDesigner. This technology has the potential to transform a wide range of industries, from entertainment and gaming to architecture and design. Overall, this paper presents a significant contribution to the field of generative AI and computer vision, and showcases the potential of LDM3D and DepthFusion to revolutionize content creation and digital experiences. A short video summarizing the approach can be found at [this url](https://t.ly/tdi2).*
@@ -31,25 +26,12 @@ Make sure to check out the Stable Diffusion [Tips](overview#tips) section to lea

 ## StableDiffusionLDM3DPipeline

-[[autodoc]] pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d.StableDiffusionLDM3DPipeline
+[[autodoc]] StableDiffusionLDM3DPipeline
 	- all
 	- __call__

-
 ## LDM3DPipelineOutput

 [[autodoc]] pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d.LDM3DPipelineOutput
 	- all
 	- __call__
-
-# Upscaler
-
-[LDM3D-VR](https://arxiv.org/pdf/2311.03226.pdf) is an extended version of LDM3D. 
-
-The abstract from the paper is:
-*Latent diffusion models have proven to be state-of-the-art in the creation and manipulation of visual outputs. However, as far as we know, the generation of depth maps jointly with RGB is still limited. We introduce LDM3D-VR, a suite of diffusion models targeting virtual reality development that includes LDM3D-pano and LDM3D-SR. These models enable the generation of panoramic RGBD based on textual prompts and the upscaling of low-resolution inputs to high-resolution RGBD, respectively. Our models are fine-tuned from existing pretrained models on datasets containing panoramic/high-resolution RGB images, depth maps and captions. Both models are evaluated in comparison to existing related methods*
-
-Two checkpoints are available for use:
- [ldm3d-pano](https://huggingface.co/Intel/ldm3d-pano). This checkpoint enables the generation of panoramic images and requires the StableDiffusionLDM3DPipeline pipeline to be used.
- [ldm3d-sr](https://huggingface.co/Intel/ldm3d-sr). This checkpoint enables the upscaling of RGB and depth images. It can be used in cascade after the original LDM3D pipeline using the StableDiffusionUpscaleLDM3DPipeline from the community pipelines.
-
@@ -121,16 +121,10 @@ The table below summarizes the available Stable Diffusion pipelines, their suppo
            <td class="px-4 py-2 text-gray-700">
            <a href="./ldm3d_diffusion">StableDiffusionLDM3D</a>
            </td>
-            <td class="px-4 py-2 text-gray-700">text-to-rgb, text-to-depth, text-to-pano</td>
+            <td class="px-4 py-2 text-gray-700">text-to-rgb, text-to-depth</td>
            <td class="px-4 py-2"><a href="https://huggingface.co/spaces/r23/ldm3d-space"><img src="https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Spaces-blue"/></a>
            </td>
        </tr>
-        <tr>
-            <td class="px-4 py-2 text-gray-700">
-            <a href="./ldm3d_diffusion">StableDiffusionUpscaleLDM3D</a>
-            </td>
-            <td class="px-4 py-2 text-gray-700">ldm3d super-resolution</td>
-        </tr>
        </tbody>
    </table>
    </div>
@@ -1,53 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# SDXL Turbo
-
-Stable Diffusion XL (SDXL) Turbo was proposed in [Adversarial Diffusion Distillation](https://stability.ai/research/adversarial-diffusion-distillation) by Axel Sauer, Dominik Lorenz, Andreas Blattmann, and Robin Rombach.
-
-The abstract from the paper is:
-
-*We introduce Adversarial Diffusion Distillation (ADD), a novel training approach that efficiently samples large-scale foundational image diffusion models in just 1–4 steps while maintaining high image quality. We use score distillation to leverage large-scale off-the-shelf image diffusion models as a teacher signal in combination with an adversarial loss to ensure high image fidelity even in the low-step regime of one or two sampling steps. Our analyses show that our model clearly outperforms existing few-step methods (GANs,Latent Consistency Models) in a single step and reaches the performance of state-of-the-art diffusion models (SDXL) in only four steps. ADD is the first method to unlock single-step, real-time image synthesis with foundation models.*
-
-## Tips
-
- SDXL Turbo uses the exact same architecture as [SDXL](./stable_diffusion_xl).
- SDXL Turbo should disable guidance scale by setting `guidance_scale=0.0`
- SDXL Turbo should use `timestep_spacing='trailing'` for the scheduler and use between 1 and 4 steps.
- SDXL Turbo has been trained to generate images of size 512x512.
- SDXL Turbo is open-access, but not open-source, meaning that one might have to buy a model license in order to use it for commercial applications. Make sure to read the [official model card](https://huggingface.co/stabilityai/sdxl-turbo) to learn more.
-
-<Tip>
-
-To learn how to use SDXL Turbo for various tasks, how to optimize performance, and other usage examples, take a look at the [SDXL Turbo](../../../using-diffusers/sdxl_turbo) guide.
-
-Check out the [Stability AI](https://huggingface.co/stabilityai) Hub organization for the official base and refiner model checkpoints!
-
-</Tip>
-
-## StableDiffusionXLPipeline
-
-[[autodoc]] StableDiffusionXLPipeline
-	- all
-	- __call__
-
-## StableDiffusionXLImg2ImgPipeline
-
-[[autodoc]] StableDiffusionXLImg2ImgPipeline
-	- all
-	- __call__
-
-## StableDiffusionXLInpaintPipeline
-
-[[autodoc]] StableDiffusionXLInpaintPipeline
-	- all
-	- __call__
@@ -92,19 +92,6 @@ imageio.mimsave("video.mp4", result, fps=4)
 ```


- #### SDXL Support
-In order to use the SDXL model when generating a video from prompt, use the `TextToVideoZeroSDXLPipeline` pipeline:
-
-```python
-import torch
-from diffusers import TextToVideoZeroSDXLPipeline
-
-model_id = "stabilityai/stable-diffusion-xl-base-1.0"
-pipe = TextToVideoZeroSDXLPipeline.from_pretrained(
-    model_id, torch_dtype=torch.float16, variant="fp16", use_safetensors=True
-).to("cuda")
-```
-
 ### Text-To-Video with Pose Control
 To generate a video from prompt with additional pose control

@@ -154,33 +141,7 @@ To generate a video from prompt with additional pose control
    result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images
    imageio.mimsave("video.mp4", result, fps=4)
    ```
- #### SDXL Support
-	
-	Since our attention processor also works with SDXL, it can be utilized to generate a video from prompt using ControlNet models powered by SDXL:
-	```python
-	import torch
-	from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel
-	from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor
-	
-	controlnet_model_id = 'thibaud/controlnet-openpose-sdxl-1.0'
-	model_id = 'stabilityai/stable-diffusion-xl-base-1.0'
-	
-	controlnet = ControlNetModel.from_pretrained(controlnet_model_id, torch_dtype=torch.float16)
-	pipe = StableDiffusionControlNetPipeline.from_pretrained(
-		model_id, controlnet=controlnet, torch_dtype=torch.float16
-	).to('cuda')
-	
-	# Set the attention processor
-	pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
-	pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))
-	
-	# fix latents for all frames
-	latents = torch.randn((1, 4, 128, 128), device="cuda", dtype=torch.float16).repeat(len(pose_images), 1, 1, 1)
-	
-	prompt = "Darth Vader dancing in a desert"
-	result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images
-	imageio.mimsave("video.mp4", result, fps=4)
-	```
+

 ### Text-To-Video with Edge Control

@@ -292,10 +253,5 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
 	- all
 	- __call__

-## TextToVideoZeroSDXLPipeline
-[[autodoc]] TextToVideoZeroSDXLPipeline
-	- all
-	- __call__
-
 ## TextToVideoPipelineOutput
 [[autodoc]] pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.TextToVideoPipelineOutput
@@ -24,7 +24,7 @@ The abstract from the paper is:

 *Model-based reinforcement learning methods often use learning only for the purpose of estimating an approximate dynamics model, offloading the rest of the decision-making work to classical trajectory optimizers. While conceptually simple, this combination has a number of empirical shortcomings, suggesting that learned models may not be well-suited to standard trajectory optimization. In this paper, we consider what it would look like to fold as much of the trajectory optimization pipeline as possible into the modeling problem, such that sampling from the model and planning with it become nearly identical. The core of our technical approach lies in a diffusion probabilistic model that plans by iteratively denoising trajectories. We show how classifier-guided sampling and image inpainting can be reinterpreted as coherent planning strategies, explore the unusual and useful properties of diffusion-based planning methods, and demonstrate the effectiveness of our framework in control settings that emphasize long-horizon decision-making and test-time flexibility.*

-You can find additional information about the model on the [project page](https://diffusion-planning.github.io/), the [original codebase](https://github.com/jannerm/diffuser), or try it out in a demo [notebook](https://colab.research.google.com/drive/1rXm8CX4ZdN5qivjJ2lhwhkOmt_m0CvU0#scrollTo=6HXJvhyqcITc&uniqifier=1).
+You can find additional information about the model on the [project page](https://diffusion-planning.github.io/), the [original codebase](https://github.com/jannerm/diffuser), or try it out in a demo [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/reinforcement_learning_with_diffusers.ipynb).

 The script to run the model is available [here](https://github.com/huggingface/diffusers/tree/main/examples/reinforcement_learning).

@@ -297,37 +297,17 @@ if you don't know yet what specific component you would like to add:
 - [Model or pipeline](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+pipeline%2Fmodel%22)
 - [Scheduler](https://github.com/huggingface/diffusers/issues?q=is%3Aopen+is%3Aissue+label%3A%22New+scheduler%22)

-Before adding any of the three components, it is strongly recommended that you give the [Philosophy guide](philosophy) a read to better understand the design of any of the three components. Please be aware that we cannot merge model, scheduler, or pipeline additions that strongly diverge from our design philosophy
-as it will lead to API inconsistencies. If you fundamentally disagree with a design choice, please open a [Feedback issue](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=) instead so that it can be discussed whether a certain design pattern/design choice shall be changed everywhere in the library and whether we shall update our design philosophy. Consistency across the library is very important for us.
+Before adding any of the three components, it is strongly recommended that you give the [Philosophy guide](philosophy) a read to better understand the design of any of the three components. Please be aware that
+we cannot merge model, scheduler, or pipeline additions that strongly diverge from our design philosophy
+as it will lead to API inconsistencies. If you fundamentally disagree with a design choice, please
+open a [Feedback issue](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feedback.md&title=) instead so that it can be discussed whether a certain design
+pattern/design choice shall be changed everywhere in the library and whether we shall update our design philosophy. Consistency across the library is very important for us.

-Please make sure to add links to the original codebase/paper to the PR and ideally also ping the original author directly on the PR so that they can follow the progress and potentially help with questions.
+Please make sure to add links to the original codebase/paper to the PR and ideally also ping the
+original author directly on the PR so that they can follow the progress and potentially help with questions.

 If you are unsure or stuck in the PR, don't hesitate to leave a message to ask for a first review or help.

-#### Copied from mechanism
-
-A unique and important feature to understand when adding any pipeline, model or scheduler code is the `# Copied from` mechanism. You'll see this all over the Diffusers codebase, and the reason we use it is to keep the codebase easy to understand and maintain. Marking code with the `# Copied from` mechanism forces the marked code to be identical to the code it was copied from. This makes it easy to update and propagate changes across many files whenever you run `make fix-copies`.
-
-For example, in the code example below, [`~diffusers.pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is the original code and `AltDiffusionPipelineOutput` uses the `# Copied from` mechanism to copy it. The only difference is changing the class prefix from `Stable` to `Alt`.
-
-```py
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_output.StableDiffusionPipelineOutput with Stable->Alt
-class AltDiffusionPipelineOutput(BaseOutput):
-    """
-    Output class for Alt Diffusion pipelines.
-
-    Args:
-        images (`List[PIL.Image.Image]` or `np.ndarray`)
-            List of denoised PIL images of length `batch_size` or NumPy array of shape `(batch_size, height, width,
-            num_channels)`.
-        nsfw_content_detected (`List[bool]`)
-            List indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content or
-            `None` if safety checking could not be performed.
-    """
-```
-
-To learn more, read this section of the [~Don't~ Repeat Yourself*](https://huggingface.co/blog/transformers-design-philosophy#4-machine-learning-models-are-static) blog post.
-
 ## How to write a good issue

 **The better your issue is written, the higher the chances that it will be quickly resolved.**
@@ -1,255 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Latent Consistency Distillation
-
-[Latent Consistency Models (LCMs)](https://hf.co/papers/2310.04378) are able to generate high-quality images in just a few steps, representing a big leap forward because many pipelines require at least 25+ steps. LCMs are produced by applying the latent consistency distillation method to any Stable Diffusion model. This method works by applying *one-stage guided distillation* to the latent space, and incorporating a *skipping-step* method to consistently skip timesteps to accelerate the distillation process (refer to section 4.1, 4.2, and 4.3 of the paper for more details).
-
-If you're training on a GPU with limited vRAM, try enabling `gradient_checkpointing`, `gradient_accumulation_steps`, and `mixed_precision` to reduce memory usage and speed up training. You can reduce your memory usage even more by enabling memory-efficient attention with [xFormers](../optimization/xformers) and [bitsandbytes'](https://github.com/TimDettmers/bitsandbytes) 8-bit optimizer.
-
-This guide will explore the [train_lcm_distill_sd_wds.py](https://github.com/huggingface/diffusers/blob/main/examples/consistency_distillation/train_lcm_distill_sd_wds.py) script to help you become more familiar with it, and how you can adapt it for your own use-case.
-
-Before running the script, make sure you install the library from source:
-
-```bash
-git clone https://github.com/huggingface/diffusers
-cd diffusers
-pip install .
-```
-
-Then navigate to the example folder containing the training script and install the required dependencies for the script you're using:
-
-```bash
-cd examples/consistency_distillation
-pip install -r requirements.txt
-```
-
-<Tip>
-
-🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more.
-
-</Tip>
-
-Initialize an 🤗 Accelerate environment (try enabling `torch.compile` to significantly speedup training):
-
-```bash
-accelerate config
-```
-
-To setup a default 🤗 Accelerate environment without choosing any configurations:
-
-```bash
-accelerate config default
-```
-
-Or if your environment doesn't support an interactive shell, like a notebook, you can use:
-
-```py
-from accelerate.utils import write_basic_config
-
-write_basic_config()
-```
-
-Lastly, if you want to train a model on your own dataset, take a look at the [Create a dataset for training](create_dataset) guide to learn how to create a dataset that works with the training script.
-
-## Script parameters
-
-<Tip>
-
-The following sections highlight parts of the training script that are important for understanding how to modify it, but it doesn't cover every aspect of the script in detail. If you're interested in learning more, feel free to read through the [script](https://github.com/huggingface/diffusers/blob/main/examples/consistency_distillation/train_lcm_distill_sd_wds.py) and let us know if you have any questions or concerns.
-
-</Tip>
-
-The training script provides many parameters to help you customize your training run. All of the parameters and their descriptions are found in the [`parse_args()`](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L419) function. This function provides default values for each parameter, such as the training batch size and learning rate, but you can also set your own values in the training command if you'd like.
-
-For example, to speedup training with mixed precision using the fp16 format, add the `--mixed_precision` parameter to the training command:
-
-```bash
-accelerate launch train_lcm_distill_sd_wds.py \
-  --mixed_precision="fp16"
-```
-
-Most of the parameters are identical to the parameters in the [Text-to-image](text2image#script-parameters) training guide, so you'll focus on the parameters that are relevant to latent consistency distillation in this guide.
-
- `--pretrained_teacher_model`: the path to a pretrained latent diffusion model to use as the teacher model
- `--pretrained_vae_model_name_or_path`: path to a pretrained VAE; the SDXL VAE is known to suffer from numerical instability, so this parameter allows you to specify an alternative VAE (like this [VAE](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) by madebyollin which works in fp16)
- `--w_min` and `--w_max`: the minimum and maximum guidance scale values for guidance scale sampling
- `--num_ddim_timesteps`: the number of timesteps for DDIM sampling
- `--loss_type`: the type of loss (L2 or Huber) to calculate for latent consistency distillation; Huber loss is generally preferred because it's more robust to outliers
- `--huber_c`: the Huber loss parameter
-
-## Training script
-
-The training script starts by creating a dataset class - [`Text2ImageDataset`](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L141) - for preprocessing the images and creating a training dataset.
-
-```py
-def transform(example):
-    image = example["image"]
-    image = TF.resize(image, resolution, interpolation=transforms.InterpolationMode.BILINEAR)
-
-    c_top, c_left, _, _ = transforms.RandomCrop.get_params(image, output_size=(resolution, resolution))
-    image = TF.crop(image, c_top, c_left, resolution, resolution)
-    image = TF.to_tensor(image)
-    image = TF.normalize(image, [0.5], [0.5])
-
-    example["image"] = image
-    return example
-```
-
-For improved performance on reading and writing large datasets stored in the cloud, this script uses the [WebDataset](https://github.com/webdataset/webdataset) format to create a preprocessing pipeline to apply transforms and create a dataset and dataloader for training. Images are processed and fed to the training loop without having to download the full dataset first.
-
-```py
-processing_pipeline = [
-    wds.decode("pil", handler=wds.ignore_and_continue),
-    wds.rename(image="jpg;png;jpeg;webp", text="text;txt;caption", handler=wds.warn_and_continue),
-    wds.map(filter_keys({"image", "text"})),
-    wds.map(transform),
-    wds.to_tuple("image", "text"),
-]
-```
-
-In the [`main()`](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L768) function, all the necessary components like the noise scheduler, tokenizers, text encoders, and VAE are loaded. The teacher UNet is also loaded here and then you can create a student UNet from the teacher UNet. The student UNet is updated by the optimizer during training.
-
-```py
-teacher_unet = UNet2DConditionModel.from_pretrained(
-    args.pretrained_teacher_model, subfolder="unet", revision=args.teacher_revision
-)
-
-unet = UNet2DConditionModel(**teacher_unet.config)
-unet.load_state_dict(teacher_unet.state_dict(), strict=False)
-unet.train()
-```
-
-Now you can create the [optimizer](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L979) to update the UNet parameters:
-
-```py
-optimizer = optimizer_class(
-    unet.parameters(),
-    lr=args.learning_rate,
-    betas=(args.adam_beta1, args.adam_beta2),
-    weight_decay=args.adam_weight_decay,
-    eps=args.adam_epsilon,
-)
-```
-
-Create the [dataset](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L994):
-
-```py
-dataset = Text2ImageDataset(
-    train_shards_path_or_url=args.train_shards_path_or_url,
-    num_train_examples=args.max_train_samples,
-    per_gpu_batch_size=args.train_batch_size,
-    global_batch_size=args.train_batch_size * accelerator.num_processes,
-    num_workers=args.dataloader_num_workers,
-    resolution=args.resolution,
-    shuffle_buffer_size=1000,
-    pin_memory=True,
-    persistent_workers=True,
-)
-train_dataloader = dataset.train_dataloader
-```
-
-Next, you're ready to setup the [training loop](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L1049) and implement the latent consistency distillation method (see Algorithm 1 in the paper for more details). This section of the script takes care of adding noise to the latents, sampling and creating a guidance scale embedding, and predicting the original image from the noise.
-
-```py
-pred_x_0 = predicted_origin(
-    noise_pred,
-    start_timesteps,
-    noisy_model_input,
-    noise_scheduler.config.prediction_type,
-    alpha_schedule,
-    sigma_schedule,
-)
-
-model_pred = c_skip_start * noisy_model_input + c_out_start * pred_x_0
-```
-
-It gets the [teacher model predictions](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L1172) and the [LCM predictions](https://github.com/huggingface/diffusers/blob/3b37488fa3280aed6a95de044d7a42ffdcb565ef/examples/consistency_distillation/train_lcm_distill_sd_wds.py#L1209) next, calculates the loss, and then backpropagates it to the LCM.
-
-```py
-if args.loss_type == "l2":
-    loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
-elif args.loss_type == "huber":
-    loss = torch.mean(
-        torch.sqrt((model_pred.float() - target.float()) ** 2 + args.huber_c**2) - args.huber_c
-    )
-```
-
-If you want to learn more about how the training loop works, check out the [Understanding pipelines, models and schedulers tutorial](../using-diffusers/write_own_pipeline) which breaks down the basic pattern of the denoising process.
-
-## Launch the script
-
-Now you're ready to launch the training script and start distilling!
-
-For this guide, you'll use the `--train_shards_path_or_url` to specify the path to the [Conceptual Captions 12M](https://github.com/google-research-datasets/conceptual-12m) dataset stored on the Hub [here](https://huggingface.co/datasets/laion/conceptual-captions-12m-webdataset). Set the `MODEL_DIR` environment variable to the name of the teacher model and `OUTPUT_DIR` to where you want to save the model.
-
-```bash
-export MODEL_DIR="runwayml/stable-diffusion-v1-5"
-export OUTPUT_DIR="path/to/saved/model"
-
-accelerate launch train_lcm_distill_sd_wds.py \
-    --pretrained_teacher_model=$MODEL_DIR \
-    --output_dir=$OUTPUT_DIR \
-    --mixed_precision=fp16 \
-    --resolution=512 \
-    --learning_rate=1e-6 --loss_type="huber" --ema_decay=0.95 --adam_weight_decay=0.0 \
-    --max_train_steps=1000 \
-    --max_train_samples=4000000 \
-    --dataloader_num_workers=8 \
-    --train_shards_path_or_url="pipe:curl -L -s https://huggingface.co/datasets/laion/conceptual-captions-12m-webdataset/resolve/main/data/{00000..01099}.tar?download=true" \
-    --validation_steps=200 \
-    --checkpointing_steps=200 --checkpoints_total_limit=10 \
-    --train_batch_size=12 \
-    --gradient_checkpointing --enable_xformers_memory_efficient_attention \
-    --gradient_accumulation_steps=1 \
-    --use_8bit_adam \
-    --resume_from_checkpoint=latest \
-    --report_to=wandb \
-    --seed=453645634 \
-    --push_to_hub
-```
-
-Once training is complete, you can use your new LCM for inference.
-
-```py
-from diffusers import UNet2DConditionModel, DiffusionPipeline, LCMScheduler
-import torch
-
-unet = UNet2DConditionModel.from_pretrained("your-username/your-model", torch_dtype=torch.float16, variant="fp16")
-pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", unet=unet, torch_dtype=torch.float16, variant="fp16")
-
-pipeline.scheduler = LCMScheduler.from_config(pipeline.scheduler.config)
-pipeline.to("cuda")
-
-prompt = "sushi rolls in the form of panda heads, sushi platter"
-
-image = pipeline(prompt, num_inference_steps=4, guidance_scale=1.0).images[0]
-```
-
-## LoRA
-
-LoRA is a training technique for significantly reducing the number of trainable parameters. As a result, training is faster and it is easier to store the resulting weights because they are a lot smaller (~100MBs). Use the [train_lcm_distill_lora_sd_wds.py](https://github.com/huggingface/diffusers/blob/main/examples/consistency_distillation/train_lcm_distill_lora_sd_wds.py) or [train_lcm_distill_lora_sdxl_wds.py](https://github.com/huggingface/diffusers/blob/main/examples/consistency_distillation/train_lcm_distill_lora_sdxl_wds.py) script to train with LoRA.
-
-The LoRA training script is discussed in more detail in the [LoRA training](lora) guide.
-
-## Stable Diffusion XL
-
-Stable Diffusion XL (SDXL) is a powerful text-to-image model that generates high-resolution images, and it adds a second text-encoder to its architecture. Use the [train_lcm_distill_sdxl_wds.py](https://github.com/huggingface/diffusers/blob/main/examples/consistency_distillation/train_lcm_distill_sdxl_wds.py) script to train a SDXL model with LoRA.
-
-The SDXL training script is discussed in more detail in the [SDXL training](sdxl) guide.
-
-## Next steps
-
-Congratulations on distilling a LCM model! To learn more about LCM, the following may be helpful:
-
- Learn how to use [LCMs for inference](../using-diffusers/lcm) for text-to-image, image-to-image, and with LoRA checkpoints.
- Read the [SDXL in 4 steps with Latent Consistency LoRAs](https://huggingface.co/blog/lcm_lora) blog post to learn more about SDXL LCM-LoRA's for super fast inference, quality comparisons, benchmarks, and more.
@@ -20,8 +20,6 @@ The Kandinsky models are a series of multilingual text-to-image generation model

 [Kandinsky 2.2](../api/pipelines/kandinsky_v22) improves on the previous model by replacing the image encoder of the image prior model with a larger CLIP-ViT-G model to improve quality. The image prior model was also retrained on images with different resolutions and aspect ratios to generate higher-resolution images and different image sizes.

-[Kandinsky 3](../api/pipelines/kandinsky3) simplifies the architecture and shifts away from the two-stage generation process involving the prior model and diffusion model. Instead, Kandinsky 3 uses [Flan-UL2](https://huggingface.co/google/flan-ul2) to encode text, a UNet with [BigGan-deep](https://hf.co/papers/1809.11096) blocks, and [Sber-MoVQGAN](https://github.com/ai-forever/MoVQGAN) to decode the latents into images. Text understanding and generated image quality are primarily achieved by using a larger text encoder and UNet.
-
 This guide will show you how to use the Kandinsky models for text-to-image, image-to-image, inpainting, interpolation, and more.

 Before you begin, make sure you have the following libraries installed:
@@ -35,10 +33,6 @@ Before you begin, make sure you have the following libraries installed:

 Kandinsky 2.1 and 2.2 usage is very similar! The only difference is Kandinsky 2.2 doesn't accept `prompt` as an input when decoding the latents. Instead, Kandinsky 2.2 only accepts `image_embeds` during decoding.

-<br>
-
-Kandinsky 3 has a more concise architecture and it doesn't require a prior model. This means its usage is identical to other diffusion models like [Stable Diffusion XL](sdxl).
-
 </Tip>

 ## Text-to-image
@@ -97,23 +91,6 @@ image
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-text-to-image.png"/>
 </div>

-</hfoption>
-<hfoption id="Kandinsky 3">
-
-Kandinsky 3 doesn't require a prior model so you can directly load the [`Kandinsky3Pipeline`] and pass a prompt to generate an image:
-
-```py
-from diffusers import Kandinsky3Pipeline
-import torch
-
-pipeline = Kandinsky3Pipeline.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16)
-pipeline.enable_model_cpu_offload()
-
-prompt = "A alien cheeseburger creature eating itself, claymation, cinematic, moody lighting"
-image = pipeline(prompt).images[0]
-image
-```
-
 </hfoption>
 </hfoptions>

@@ -184,20 +161,6 @@ prior_pipeline = KandinskyPriorPipeline.from_pretrained("kandinsky-community/kan
 pipeline = KandinskyV22Img2ImgPipeline.from_pretrained("kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16, use_safetensors=True).to("cuda")
 ```

-</hfoption>
-<hfoption id="Kandinsky 3">
-
-Kandinsky 3 doesn't require a prior model so you can directly load the image-to-image pipeline:
-
-```py
-from diffusers import Kandinsky3Img2ImgPipeline
-from diffusers.utils import load_image
-import torch
-
-pipeline = Kandinsky3Img2ImgPipeline.from_pretrained("kandinsky-community/kandinsky-3", variant="fp16", torch_dtype=torch.float16)
-pipeline.enable_model_cpu_offload()
-```
-
 </hfoption>
 </hfoptions>

@@ -255,14 +218,6 @@ make_image_grid([original_image.resize((512, 512)), image.resize((512, 512))], r
    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/kandinsky-image-to-image.png"/>
 </div>

-</hfoption>
-<hfoption id="Kandinsky 3">
-
-```py
-image = pipeline(prompt, negative_prompt=negative_prompt, image=image, strength=0.75, num_inference_steps=25).images[0]
-image
-```
-
 </hfoption>
 </hfoptions>

@@ -1,116 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Stable Diffusion XL Turbo
-
-[[open-in-colab]]
-
-SDXL Turbo is an adversarial time-distilled [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) (SDXL) model capable
-of running inference in as little as 1 step.
-
-This guide will show you how to use SDXL-Turbo for text-to-image and image-to-image.
-
-Before you begin, make sure you have the following libraries installed:
-
-```py
-# uncomment to install the necessary libraries in Colab
-#!pip install -q diffusers transformers accelerate omegaconf
-```
-
-## Load model checkpoints
-
-Model weights may be stored in separate subfolders on the Hub or locally, in which case, you should use the [`~StableDiffusionXLPipeline.from_pretrained`] method:
-
-```py
-from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image
-import torch
-
-pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16")
-pipeline = pipeline.to("cuda")
-```
-
-You can also use the [`~StableDiffusionXLPipeline.from_single_file`] method to load a model checkpoint stored in a single file format (`.ckpt` or `.safetensors`) from the Hub or locally:
-
-```py
-from diffusers import StableDiffusionXLPipeline
-import torch
-
-pipeline = StableDiffusionXLPipeline.from_single_file(
-    "https://huggingface.co/stabilityai/sdxl-turbo/blob/main/sd_xl_turbo_1.0_fp16.safetensors", torch_dtype=torch.float16)
-pipeline = pipeline.to("cuda")
-```
-
-## Text-to-image
-
-For text-to-image, pass a text prompt. By default, SDXL Turbo generates a 512x512 image, and that resolution gives the best results. You can try setting the `height` and `width` parameters to 768x768 or 1024x1024, but you should expect quality degradations when doing so.
-
-Make sure to set `guidance_scale` to 0.0 to disable, as the model was trained without it. A single inference step is enough to generate high quality images. 
-Increasing the number of steps to 2, 3 or 4 should improve image quality.
-
-```py
-from diffusers import AutoPipelineForText2Image
-import torch
-
-pipeline_text2image = AutoPipelineForText2Image.from_pretrained("stabilityai/sdxl-turbo", torch_dtype=torch.float16, variant="fp16")
-pipeline_text2image = pipeline_text2image.to("cuda")
-
-prompt = "A cinematic shot of a baby racoon wearing an intricate italian priest robe."
-
-image = pipeline_text2image(prompt=prompt, guidance_scale=0.0, num_inference_steps=1).images[0]
-image
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/sdxl-turbo-text2img.png" alt="generated image of a racoon in a robe"/>
-</div>
-
-## Image-to-image
-
-For image-to-image generation, make sure that `num_inference_steps * strength` is greater than or equal to 1.
-The image-to-image pipeline will run for `int(num_inference_steps * strength)` steps, e.g. `0.5 * 2.0 = 1` step in
-our example below.
-
-```py
-from diffusers import AutoPipelineForImage2Image
-from diffusers.utils import load_image, make_image_grid
-
-# use from_pipe to avoid consuming additional memory when loading a checkpoint
-pipeline = AutoPipelineForImage2Image.from_pipe(pipeline_text2image).to("cuda")
-
-init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png")
-init_image = init_image.resize((512, 512))
-
-prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"
-
-image = pipeline(prompt, image=init_image, strength=0.5, guidance_scale=0.0, num_inference_steps=2).images[0]
-make_image_grid([init_image, image], rows=1, cols=2)
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/sdxl-turbo-img2img.png" alt="Image-to-image generation sample using SDXL Turbo"/>
-</div>
-
-## Speed-up SDXL Turbo even more
-
- Compile the UNet if you are using PyTorch version 2 or better. The first inference run will be very slow, but subsequent ones will be much faster.
-
-```py
-pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-```
-
- When using the default VAE, keep it in `float32` to avoid costly `dtype` conversions before and after each generation. You only need to do this once before your first generation:
-
-```py
-pipe.upcast_vae()
-```
-
-As an alternative, you can also use a [16-bit VAE](https://huggingface.co/madebyollin/sdxl-vae-fp16-fix) created by community member [`@madebyollin`](https://huggingface.co/madebyollin) that does not need to be upcasted to `float32`.
@@ -1,134 +0,0 @@
-<!--Copyright 2023 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# Stable Video Diffusion
-
-[[open-in-colab]]
-
-[Stable Video Diffusion](https://static1.squarespace.com/static/6213c340453c3f502425776e/t/655ce779b9d47d342a93c890/1700587395994/stable_video_diffusion.pdf) is a powerful image-to-video generation model that can generate high resolution (576x1024) 2-4 second videos conditioned on the input image.
-
-This guide will show you how to use SVD to generate short videos from images.
-
-Before you begin, make sure you have the following libraries installed:
-
-```py
-!pip install -q -U diffusers transformers accelerate 
-```
-
-## Image to Video Generation
-
-There are two variants of SVD: [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid)
-and [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt). The svd checkpoint is trained to generate 14 frames and the svd-xt checkpoint is further 
-finetuned to generate 25 frames.
-
-We will use the `svd-xt` checkpoint for this guide.
-
-```python
-import torch
-
-from diffusers import StableVideoDiffusionPipeline
-from diffusers.utils import load_image, export_to_video
-
-pipe = StableVideoDiffusionPipeline.from_pretrained(
-    "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
-)
-pipe.enable_model_cpu_offload()
-
-# Load the conditioning image
-image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png?download=true")
-image = image.resize((1024, 576))
-
-generator = torch.manual_seed(42)
-frames = pipe(image, decode_chunk_size=8, generator=generator).frames[0]
-
-export_to_video(frames, "generated.mp4", fps=7)
-```
-
-<video controls width="1024" height="576">
-  <source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket_generated.webm" type="video/webm" />
-  <source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket_generated.mp4" type="video/mp4" />
-</video>
-
-<Tip>
-Since generating videos is more memory intensive we can use the `decode_chunk_size` argument to control how many frames are decoded at once. This will reduce the memory usage. It's recommended to tweak this value based on your GPU memory.
-Setting `decode_chunk_size=1` will decode one frame at a time and will use the least amount of memory but the video might have some flickering.
-
-Additionally, we also use [model cpu offloading](../../optimization/memory#model-offloading) to reduce the memory usage.
-</Tip>
-
-
-### Torch.compile
-
-You can achieve a 20-25% speed-up at the expense of slightly increased memory by compiling the UNet as follows:
-
-```diff
- pipe.enable_model_cpu_offload()
-+ pipe.to("cuda")
-+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
-```
-
-### Low-memory
-
-Video generation is very memory intensive as we have to essentially generate `num_frames` all at once. The mechanism is very comparable to text-to-image generation with a high batch size. To reduce the memory requirement you have multiple options. The following options trade inference speed against lower memory requirement:
- enable model offloading: Each component of the pipeline is offloaded to CPU once it's not needed anymore.
- enable feed-forward chunking: The feed-forward layer runs in a loop instead of running with a single huge feed-forward batch size
- reduce `decode_chunk_size`: This means that the VAE decodes frames in chunks instead of decoding them all together. **Note**: In addition to causing a small slowdown, this method also leads to a slight deterioration in video quality
-
-You can enable them as follows:
-
-```diff
-pipe.enable_model_cpu_offload()
-frames = pipe(image, decode_chunk_size=8, generator=generator).frames[0]
-+pipe.enable_model_cpu_offload()
-+pipe.unet.enable_forward_chunking()
-+frames = pipe(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
-```
-
-
-Including all these tricks should lower the memory requirement to less than 8GB VRAM.
-
-### Micro-conditioning
-
-Along with the conditioning image, Stable Video Diffusion also accepts micro-conditioning inputs that allow more control over the generated video.
-It accepts the following arguments:
-
- `fps`: The frames per second of the generated video.
- `motion_bucket_id`: The motion bucket id to use for the generated video. This can be used to control the motion of the generated video. Increasing the motion bucket id will increase the motion of the generated video.
- `noise_aug_strength`: The amount of noise added to the conditioning image. The higher the values the less the video will resemble the conditioning image. Increasing this value will also increase the motion of the generated video.
-
-Here is an example of using micro-conditioning to generate a video with more motion.
-
-
-```python
-import torch
-
-from diffusers import StableVideoDiffusionPipeline
-from diffusers.utils import load_image, export_to_video
-
-pipe = StableVideoDiffusionPipeline.from_pretrained(
-  "stabilityai/stable-video-diffusion-img2vid-xt", torch_dtype=torch.float16, variant="fp16"
-)
-pipe.enable_model_cpu_offload()
-
-# Load the conditioning image
-image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png?download=true")
-image = image.resize((1024, 576))
-
-generator = torch.manual_seed(42)
-frames = pipe(image, decode_chunk_size=8, generator=generator, motion_bucket_id=180, noise_aug_strength=0.1).frames[0]
-export_to_video(frames, "generated.mp4", fps=7)
-```
-
-<video width="1024" height="576" controls>
-  <source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket_generated_motion.mp4" type="video/mp4">
-</video>
-
@@ -14,41 +14,54 @@ specific language governing permissions and limitations under the License.

 [[open-in-colab]]

-Unconditional image generation generates images that look like a random sample from the training data the model was trained on because the denoising process is not guided by any additional context like text or image.
+Unconditional image generation is a relatively straightforward task. The model only generates images - without any additional context like text or an image - resembling the training data it was trained on.

-To get started, use the [`DiffusionPipeline`] to load the [anton-l/ddpm-butterflies-128](https://huggingface.co/anton-l/ddpm-butterflies-128) checkpoint to generate images of butterflies. The [`DiffusionPipeline`] downloads and caches all the model components required to generate an image.
+The [`DiffusionPipeline`] is the easiest way to use a pre-trained diffusion system for inference.

-```py
+Start by creating an instance of [`DiffusionPipeline`] and specify which pipeline checkpoint you would like to download.
+You can use any of the 🧨 Diffusers [checkpoints](https://huggingface.co/models?library=diffusers&sort=downloads) from the Hub (the checkpoint you'll use generates images of butterflies).
+
+<Tip>
+
+💡 Want to train your own unconditional image generation model? Take a look at the training [guide](../training/unconditional_training) to learn how to generate your own images.
+
+</Tip>
+
+In this guide, you'll use [`DiffusionPipeline`] for unconditional image generation with [DDPM](https://arxiv.org/abs/2006.11239):
+
+```python
 from diffusers import DiffusionPipeline

-generator = DiffusionPipeline.from_pretrained("anton-l/ddpm-butterflies-128").to("cuda")
+generator = DiffusionPipeline.from_pretrained("anton-l/ddpm-butterflies-128", use_safetensors=True)
+```
+
+The [`DiffusionPipeline`] downloads and caches all modeling, tokenization, and scheduling components.
+Because the model consists of roughly 1.4 billion parameters, we strongly recommend running it on a GPU.
+You can move the generator object to a GPU, just like you would in PyTorch:
+
+```python
+generator.to("cuda")
+```
+
+Now you can use the `generator` to generate an image:
+
+```python
 image = generator().images[0]
 image
 ```

-<Tip>
+The output is by default wrapped into a [`PIL.Image`](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class) object.

-Want to generate images of something else? Take a look at the training [guide](../training/unconditional_training) to learn how to train a model to generate your own images.
+You can save the image by calling:

-</Tip>
-
-The output image is a [`PIL.Image`](https://pillow.readthedocs.io/en/stable/reference/Image.html?highlight=image#the-image-class) object that can be saved:
-
-```py
+```python
 image.save("generated_image.png")
 ```

-You can also try experimenting with the `num_inference_steps` parameter, which controls the number of denoising steps. More denoising steps typically produce higher quality images, but it'll take longer to generate. Feel free to play around with this parameter to see how it affects the image quality.
-
-```py
-image = generator(num_inference_steps=100).images[0]
-image
-```
-
-Try out the Space below to generate an image of a butterfly!
+Try out the Spaces below, and feel free to play around with the inference steps parameter to see how it affects the image quality!

 <iframe
-	src="https://stevhliu-unconditional-image-generation.hf.space"
+	src="https://stevhliu-ddpm-butterflies-128.hf.space"
 	frameborder="0"
 	width="850"
 	height="500"
@@ -96,4 +96,3 @@ specific language governing permissions and limitations under the License.
 | [versatile_diffusion](./api/pipelines/versatile_diffusion) | [Versatile Diffusion: Text, Images and Variations All in One Diffusion Model](https://arxiv.org/abs/2211.08332) | Dual Image and Text Guided Generation |
 | [vq_diffusion](./api/pipelines/vq_diffusion) | [Vector Quantized Diffusion Model for Text-to-Image Synthesis](https://arxiv.org/abs/2111.14822) | Text-to-Image Generation |
 | [stable_diffusion_ldm3d](./api/pipelines/stable_diffusion/ldm3d_diffusion) | [LDM3D: Latent Diffusion Model for 3D](https://arxiv.org/abs/2305.10853) | Text to Image and Depth Generation |
-| [stable_diffusion_upscaler_ldm3d](./api/pipelines/stable_diffusion/ldm3d_diffusion) | [LDM3D-VR: Latent Diffusion Model for 3D VR](https://arxiv.org/pdf/2311.03226) | Image and Depth Upscaling |
@@ -54,7 +54,7 @@ from diffusers import (
    UNet2DConditionModel,
 )
 from diffusers.loaders import LoraLoaderMixin
-from diffusers.models.lora import LoRALinearLayer
+from diffusers.models.lora import LoRALinearLayer, text_encoder_lora_state_dict
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import compute_snr, unet_lora_state_dict
 from diffusers.utils import check_min_version, is_wandb_available
@@ -62,51 +62,16 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risk.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__)


-# TODO: This function should be removed once training scripts are rewritten in PEFT
-def text_encoder_lora_state_dict(text_encoder):
-    state_dict = {}
-
-    def text_encoder_attn_modules(text_encoder):
-        from transformers import CLIPTextModel, CLIPTextModelWithProjection
-
-        attn_modules = []
-
-        if isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)):
-            for i, layer in enumerate(text_encoder.text_model.encoder.layers):
-                name = f"text_model.encoder.layers.{i}.self_attn"
-                mod = layer.self_attn
-                attn_modules.append((name, mod))
-
-        return attn_modules
-
-    for name, module in text_encoder_attn_modules(text_encoder):
-        for k, v in module.q_proj.lora_linear_layer.state_dict().items():
-            state_dict[f"{name}.q_proj.lora_linear_layer.{k}"] = v
-
-        for k, v in module.k_proj.lora_linear_layer.state_dict().items():
-            state_dict[f"{name}.k_proj.lora_linear_layer.{k}"] = v
-
-        for k, v in module.v_proj.lora_linear_layer.state_dict().items():
-            state_dict[f"{name}.v_proj.lora_linear_layer.{k}"] = v
-
-        for k, v in module.out_proj.lora_linear_layer.state_dict().items():
-            state_dict[f"{name}.out_proj.lora_linear_layer.{k}"] = v
-
-    return state_dict
-
-
 def save_model_card(
    repo_id: str,
    images=None,
    base_model=str,
    train_text_encoder=False,
-    train_text_encoder_ti=False,
-    token_abstraction_dict=None,
    instance_prompt=str,
    validation_prompt=str,
    repo_folder=None,
@@ -118,23 +83,10 @@ def save_model_card(
        img_str += f"""
        - text: '{validation_prompt if validation_prompt else ' ' }'
          output:
-            url:
+            url: >-
                "image_{i}.png"
        """

-    trigger_str = f"You should use {instance_prompt} to trigger the image generation."
-    if train_text_encoder_ti:
-        trigger_str = (
-            "To trigger image generation of trained concept(or concepts) replace each concept identifier "
-            "in you prompt with the new inserted tokens:\n"
-        )
-        if token_abstraction_dict:
-            for key, value in token_abstraction_dict.items():
-                tokens = "".join(value)
-                trigger_str += f"""
-to trigger concept `{key}->` use `{tokens}` in your prompt \n
-"""
-
    yaml = f"""
 ---
 tags:
@@ -144,7 +96,9 @@ tags:
 - diffusers
 - lora
 - template:sd-lora
+widget:
 {img_str}
+---
 base_model: {base_model}
 instance_prompt: {instance_prompt}
 license: openrail++
@@ -158,19 +112,14 @@ license: openrail++

 ## Model description

-### These are {repo_id} LoRA adaption weights for {base_model}.
-
+These are {repo_id} LoRA adaption weights for {base_model}.
 The weights were trained  using [DreamBooth](https://dreambooth.github.io/).
-
 LoRA for the text encoder was enabled: {train_text_encoder}.
-
-Pivotal tuning was enabled: {train_text_encoder_ti}.
-
 Special VAE used for training: {vae_path}.

 ## Trigger words

-{trigger_str}
+You should use {instance_prompt} to trigger the image generation.

 ## Download model

@@ -225,12 +174,6 @@ def parse_args(input_args=None):
        required=False,
        help="Revision of pretrained model identifier from huggingface.co/models.",
    )
-    parser.add_argument(
-        "--variant",
-        type=str,
-        default=None,
-        help="Variant of the model files of the pretrained model identifier from huggingface.co/models, 'e.g.' fp16",
-    )
    parser.add_argument(
        "--dataset_name",
        type=str,
@@ -238,26 +181,20 @@ def parse_args(input_args=None):
        help=(
            "The name of the Dataset (from the HuggingFace hub) containing the training data of instance images (could be your own, possibly private,"
            " dataset). It can also be a path pointing to a local copy of a dataset in your filesystem,"
-            " or to a folder containing files that 🤗 Datasets can understand.To load the custom captions, the training set directory needs to follow the structure of a "
-            "datasets ImageFolder, containing both the images and the corresponding caption for each image. see: "
-            "https://huggingface.co/docs/datasets/image_dataset for more information"
+            " or to a folder containing files that 🤗 Datasets can understand."
        ),
    )
    parser.add_argument(
        "--dataset_config_name",
        type=str,
        default=None,
-        help="The config of the Dataset. In some cases, a dataset may have more than one configuration (for example "
-        "if it contains different subsets of data within, and you only wish to load a specific subset - in that case specify the desired configuration using --dataset_config_name. Leave as "
-        "None if there's only one config.",
+        help="The config of the Dataset, leave as None if there's only one config.",
    )
    parser.add_argument(
        "--instance_data_dir",
        type=str,
        default=None,
-        help="A path to local folder containing the training data of instance images. Specify this arg instead of "
-        "--dataset_name if you wish to train using a local folder without custom captions. If you wish to train with custom captions please specify "
-        "--dataset_name instead.",
+        help=("A folder containing the training data. "),
    )

    parser.add_argument(
@@ -300,18 +237,15 @@ def parse_args(input_args=None):
    )
    parser.add_argument(
        "--token_abstraction",
-        type=str,
        default="TOK",
        help="identifier specifying the instance(or instances) as used in instance_prompt, validation prompt, "
-        "captions - e.g. TOK. To use multiple identifiers, please specify them in a comma seperated string - e.g. "
-        "'TOK,TOK2,TOK3' etc.",
+        "captions - e.g. TOK",
    )

    parser.add_argument(
        "--num_new_tokens_per_abstraction",
-        type=int,
        default=2,
-        help="number of new tokens inserted to the tokenizers per token_abstraction identifier when "
+        help="number of new tokens inserted to the tokenizers per token_abstraction value when "
        "--train_text_encoder_ti = True. By default, each --token_abstraction (e.g. TOK) is mapped to 2 new "
        "tokens - <si><si+1> ",
    )
@@ -521,7 +455,7 @@ def parse_args(input_args=None):
    parser.add_argument(
        "--train_text_encoder_frac",
        type=float,
-        default=1.0,
+        default=0.5,
        help=("The percentage of epochs to perform text encoder tuning"),
    )

@@ -554,7 +488,7 @@ def parse_args(input_args=None):
    parser.add_argument("--prodigy_decouple", type=bool, default=True, help="Use AdamW style decoupled weight decay")
    parser.add_argument("--adam_weight_decay", type=float, default=1e-04, help="Weight decay to use for unet params")
    parser.add_argument(
-        "--adam_weight_decay_text_encoder", type=float, default=None, help="Weight decay to use for text_encoder"
+        "--adam_weight_decay_text_encoder", type=float, default=1e-03, help="Weight decay to use for text_encoder"
    )

    parser.add_argument(
@@ -662,6 +596,17 @@ def parse_args(input_args=None):
            "inversion training check `--train_text_encoder_ti`"
        )

+    if args.train_text_encoder_ti:
+        if isinstance(args.token_abstraction, str):
+            args.token_abstraction = [args.token_abstraction]
+        elif isinstance(args.token_abstraction, List):
+            args.token_abstraction = args.token_abstraction
+        else:
+            raise ValueError(
+                f"Unsupported type for --args.token_abstraction: {type(args.token_abstraction)}. "
+                f"Supported types are: str (for a single instance identifier) or List[str] (for multiple concepts)"
+            )
+
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank
@@ -734,19 +679,12 @@ class TokenEmbeddingsHandler:
    def save_embeddings(self, file_path: str):
        assert self.train_ids is not None, "Initialize new tokens before saving embeddings."
        tensors = {}
-        # text_encoder_0 - CLIP ViT-L/14, text_encoder_1 -  CLIP ViT-G/14
-        idx_to_text_encoder_name = {0: "clip_l", 1: "clip_g"}
        for idx, text_encoder in enumerate(self.text_encoders):
            assert text_encoder.text_model.embeddings.token_embedding.weight.data.shape[0] == len(
                self.tokenizers[0]
            ), "Tokenizers should be the same."
            new_token_embeddings = text_encoder.text_model.embeddings.token_embedding.weight.data[self.train_ids]
-
-            # New tokens for each text encoder are saved under "clip_l" (for text_encoder 0), "clip_g" (for
-            # text_encoder 1) to keep compatible with the ecosystem.
-            # Note: When loading with diffusers, any name can work - simply specify in inference
-            tensors[idx_to_text_encoder_name[idx]] = new_token_embeddings
-            # tensors[f"text_encoders_{idx}"] = new_token_embeddings
+            tensors[f"text_encoders_{idx}"] = new_token_embeddings

        save_file(tensors, file_path)

@@ -758,6 +696,19 @@ class TokenEmbeddingsHandler:
    def device(self):
        return self.text_encoders[0].device

+    # def _load_embeddings(self, loaded_embeddings, tokenizer, text_encoder):
+    #     # Assuming new tokens are of the format <s_i>
+    #     self.inserting_toks = [f"<s{i}>" for i in range(loaded_embeddings.shape[0])]
+    #     special_tokens_dict = {"additional_special_tokens": self.inserting_toks}
+    #     tokenizer.add_special_tokens(special_tokens_dict)
+    #     text_encoder.resize_token_embeddings(len(tokenizer))
+    #
+    #     self.train_ids = tokenizer.convert_tokens_to_ids(self.inserting_toks)
+    #     assert self.train_ids is not None, "New tokens could not be converted to IDs."
+    #     text_encoder.text_model.embeddings.token_embedding.weight.data[
+    #         self.train_ids
+    #     ] = loaded_embeddings.to(device=self.device).to(dtype=self.dtype)
+
    @torch.no_grad()
    def retract_embeddings(self):
        for idx, text_encoder in enumerate(self.text_encoders):
@@ -779,6 +730,15 @@ class TokenEmbeddingsHandler:
            new_embeddings = new_embeddings * (off_ratio**0.1)
            text_encoder.text_model.embeddings.token_embedding.weight.data[index_updates] = new_embeddings

+    # def load_embeddings(self, file_path: str):
+    #     with safe_open(file_path, framework="pt", device=self.device.type) as f:
+    #         for idx in range(len(self.text_encoders)):
+    #             text_encoder = self.text_encoders[idx]
+    #             tokenizer = self.tokenizers[idx]
+    #
+    #             loaded_embeddings = f.get_tensor(f"text_encoders_{idx}")
+    #             self._load_embeddings(loaded_embeddings, tokenizer, text_encoder)
+

 class DreamBoothDataset(Dataset):
    """
@@ -1061,7 +1021,6 @@ def main(args):
                args.pretrained_model_name_or_path,
                torch_dtype=torch_dtype,
                revision=args.revision,
-                variant=args.variant,
            )
            pipeline.set_progress_bar_config(disable=True)

@@ -1100,18 +1059,10 @@ def main(args):

    # Load the tokenizers
    tokenizer_one = AutoTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="tokenizer",
-        revision=args.revision,
-        variant=args.variant,
-        use_fast=False,
+        args.pretrained_model_name_or_path, subfolder="tokenizer", revision=args.revision, use_fast=False
    )
    tokenizer_two = AutoTokenizer.from_pretrained(
-        args.pretrained_model_name_or_path,
-        subfolder="tokenizer_2",
-        revision=args.revision,
-        variant=args.variant,
-        use_fast=False,
+        args.pretrained_model_name_or_path, subfolder="tokenizer_2", revision=args.revision, use_fast=False
    )

    # import correct text encoder classes
@@ -1125,10 +1076,10 @@ def main(args):
    # Load scheduler and models
    noise_scheduler = DDPMScheduler.from_pretrained(args.pretrained_model_name_or_path, subfolder="scheduler")
    text_encoder_one = text_encoder_cls_one.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
+        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
    )
    text_encoder_two = text_encoder_cls_two.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
+        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision
    )
    vae_path = (
        args.pretrained_model_name_or_path
@@ -1136,24 +1087,16 @@ def main(args):
        else args.pretrained_vae_model_name_or_path
    )
    vae = AutoencoderKL.from_pretrained(
-        vae_path,
-        subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
-        revision=args.revision,
-        variant=args.variant,
+        vae_path, subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None, revision=args.revision
    )
    unet = UNet2DConditionModel.from_pretrained(
-        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
+        args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision
    )

    if args.train_text_encoder_ti:
-        # we parse the provided token identifier (or identifiers) into a list. s.t. - "TOK" -> ["TOK"], "TOK,
-        # TOK2" -> ["TOK", "TOK2"] etc.
-        token_abstraction_list = "".join(args.token_abstraction.split()).split(",")
-        logger.info(f"list of token identifiers: {token_abstraction_list}")
-
        token_abstraction_dict = {}
        token_idx = 0
-        for i, token in enumerate(token_abstraction_list):
+        for i, token in enumerate(args.token_abstraction):
            token_abstraction_dict[token] = [
                f"<s{token_idx + i + j}>" for j in range(args.num_new_tokens_per_abstraction)
            ]
@@ -1273,8 +1216,6 @@ def main(args):
        text_lora_parameters_one = []
        for name, param in text_encoder_one.named_parameters():
            if "token_embedding" in name:
-                # ensure that dtype is float32, even if rest of the model that isn't trained is loaded in fp16
-                param = param.to(dtype=torch.float32)
                param.requires_grad = True
                text_lora_parameters_one.append(param)
            else:
@@ -1282,8 +1223,6 @@ def main(args):
        text_lora_parameters_two = []
        for name, param in text_encoder_two.named_parameters():
            if "token_embedding" in name:
-                # ensure that dtype is float32, even if rest of the model that isn't trained is loaded in fp16
-                param = param.to(dtype=torch.float32)
                param.requires_grad = True
                text_lora_parameters_two.append(param)
            else:
@@ -1370,16 +1309,12 @@ def main(args):
        # different learning rate for text encoder and unet
        text_lora_parameters_one_with_lr = {
            "params": text_lora_parameters_one,
-            "weight_decay": args.adam_weight_decay_text_encoder
-            if args.adam_weight_decay_text_encoder
-            else args.adam_weight_decay,
+            "weight_decay": args.adam_weight_decay_text_encoder,
            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
        }
        text_lora_parameters_two_with_lr = {
            "params": text_lora_parameters_two,
-            "weight_decay": args.adam_weight_decay_text_encoder
-            if args.adam_weight_decay_text_encoder
-            else args.adam_weight_decay,
+            "weight_decay": args.adam_weight_decay_text_encoder,
            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
        }
        params_to_optimize = [
@@ -1559,12 +1494,6 @@ def main(args):
                tokens_one = torch.cat([tokens_one, class_tokens_one], dim=0)
                tokens_two = torch.cat([tokens_two, class_tokens_two], dim=0)

-    if args.train_text_encoder_ti and args.validation_prompt:
-        # replace instances of --token_abstraction in validation prompt with the new tokens: "<si><si+1>" etc.
-        for token_abs, token_replacement in train_dataset.token_abstraction_dict.items():
-            args.validation_prompt = args.validation_prompt.replace(token_abs, "".join(token_replacement))
-    print("validation prompt:", args.validation_prompt)
-
    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
@@ -1664,10 +1593,27 @@ def main(args):
            if epoch == num_train_epochs_text_encoder:
                print("PIVOT HALFWAY", epoch)
                # stopping optimization of text_encoder params
-                # re setting the optimizer to optimize only on unet params
-                optimizer.param_groups[1]["lr"] = 0.0
-                optimizer.param_groups[2]["lr"] = 0.0
-
+                params_to_optimize = params_to_optimize[:1]
+                # reinitializing the optimizer to optimize only on unet params
+                if args.optimizer.lower() == "prodigy":
+                    optimizer = optimizer_class(
+                        params_to_optimize,
+                        lr=args.learning_rate,
+                        betas=(args.adam_beta1, args.adam_beta2),
+                        beta3=args.prodigy_beta3,
+                        weight_decay=args.adam_weight_decay,
+                        eps=args.adam_epsilon,
+                        decouple=args.prodigy_decouple,
+                        use_bias_correction=args.prodigy_use_bias_correction,
+                        safeguard_warmup=args.prodigy_safeguard_warmup,
+                    )
+                else:  # AdamW or 8-bit-AdamW
+                    optimizer = optimizer_class(
+                        params_to_optimize,
+                        betas=(args.adam_beta1, args.adam_beta2),
+                        weight_decay=args.adam_weight_decay,
+                        eps=args.adam_epsilon,
+                    )
            else:
                 # still optimizing the text encoder
                text_encoder_one.train()
@@ -1682,7 +1628,7 @@ def main(args):
            with accelerator.accumulate(unet):
                pixel_values = batch["pixel_values"].to(dtype=vae.dtype)
                prompts = batch["prompts"]
-                # print(prompts)
+                print(prompts)
                # encode batch prompts when custom prompts are provided for each image -
                if train_dataset.custom_instance_prompts:
                    if freeze_text_encoder:
@@ -1855,18 +1801,12 @@ def main(args):
                    f" {args.validation_prompt}."
                )
                # create pipeline
-                if freeze_text_encoder:
+                if not args.train_text_encoder:
                    text_encoder_one = text_encoder_cls_one.from_pretrained(
-                        args.pretrained_model_name_or_path,
-                        subfolder="text_encoder",
-                        revision=args.revision,
-                        variant=args.variant,
+                        args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision
                    )
                    text_encoder_two = text_encoder_cls_two.from_pretrained(
-                        args.pretrained_model_name_or_path,
-                        subfolder="text_encoder_2",
-                        revision=args.revision,
-                        variant=args.variant,
+                        args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision
                    )
                pipeline = StableDiffusionXLPipeline.from_pretrained(
                    args.pretrained_model_name_or_path,
@@ -1875,7 +1815,6 @@ def main(args):
                    text_encoder_2=accelerator.unwrap_model(text_encoder_two),
                    unet=accelerator.unwrap_model(unet),
                    revision=args.revision,
-                    variant=args.variant,
                    torch_dtype=weight_dtype,
                )

@@ -1953,15 +1892,10 @@ def main(args):
            vae_path,
            subfolder="vae" if args.pretrained_vae_model_name_or_path is None else None,
            revision=args.revision,
-            variant=args.variant,
            torch_dtype=weight_dtype,
        )
        pipeline = StableDiffusionXLPipeline.from_pretrained(
-            args.pretrained_model_name_or_path,
-            vae=vae,
-            revision=args.revision,
-            variant=args.variant,
-            torch_dtype=weight_dtype,
+            args.pretrained_model_name_or_path, vae=vae, revision=args.revision, torch_dtype=weight_dtype
        )

        # We train on the simplified learning objective. If we were previously predicting a variance, we need the scheduler to ignore it
@@ -2014,8 +1948,6 @@ def main(args):
                images=images,
                base_model=args.pretrained_model_name_or_path,
                train_text_encoder=args.train_text_encoder,
-                train_text_encoder_ti=args.train_text_encoder_ti,
-                token_abstraction_dict=train_dataset.token_abstraction_dict,
                instance_prompt=args.instance_prompt,
                validation_prompt=args.validation_prompt,
                repo_folder=args.output_dir,
@@ -48,9 +48,7 @@ prompt-to-prompt | change parts of a prompt and retain image structure (see [pap
 |   Latent Consistency Pipeline                                                                                                    | Implementation of [Latent Consistency Models: Synthesizing High-Resolution Images with Few-Step Inference](https://arxiv.org/abs/2310.04378)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [Latent Consistency Pipeline](#latent-consistency-pipeline)      | - |              [Simian Luo](https://github.com/luosiallen) |
 |   Latent Consistency Img2img Pipeline                                                                                                    | Img2img pipeline for Latent Consistency Models                                                                                                                                                                                                                                                                                                                                                                                                                                    | [Latent Consistency Img2Img Pipeline](#latent-consistency-img2img-pipeline)      | - |              [Logan Zoellner](https://github.com/nagolinc) |
 |   Latent Consistency Interpolation Pipeline                                                                                                    | Interpolate the latent space of Latent Consistency Models with multiple prompts                                                                                                                                                                                                                                                                                                                                                                                                                                    | [Latent Consistency Interpolation Pipeline](#latent-consistency-interpolation-pipeline) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1pK3NrLWJSiJsBynLns1K1-IDTW9zbPvl?usp=sharing) | [Aryan V S](https://github.com/a-r-r-o-w) |
-|   Regional Prompting Pipeline                                                                                               | Assign multiple prompts for different regions                                                                                                                                                                                                                                                                                                                                                    |  [Regional Prompting Pipeline](#regional-prompting-pipeline) | - | [hako-mikan](https://github.com/hako-mikan) |
-| LDM3D-sr (LDM3D upscaler)                                                                                                             | Upscale low resolution RGB and depth inputs to high resolution                                                                                                                                                                                                                                                                                                                                                                                                                              | [StableDiffusionUpscaleLDM3D Pipeline](https://github.com/estelleafl/diffusers/tree/ldm3d_upscaler_community/examples/community#stablediffusionupscaleldm3d-pipeline)                                                                             | -                                                                                                                                                                                                             |                                                        [Estelle Aflalo](https://github.com/estelleafl) |
-|   DemoFusion Pipeline                                                                                                    | Implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [DemoFusion Pipeline](#DemoFusion)      | - |              [Ruoyi Du](https://github.com/RuoyiDu) |
+

 To load a custom pipeline, you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, set to the name of one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines; we will merge them quickly.
 ```py
@@ -78,7 +76,6 @@ from diffusers import DiffusionPipeline
 pipe = DiffusionPipeline.from_pretrained(
    "longlian/lmd_plus", 
    custom_pipeline="llm_grounded_diffusion",
-    custom_revision="main",
    variant="fp16", torch_dtype=torch.float16
 )
 pipe.enable_model_cpu_offload()
@@ -2347,47 +2344,6 @@ images = pipe(
 assert len(images) == (len(prompts) - 1) * num_interpolation_steps
 ```

-###  StableDiffusionUpscaleLDM3D Pipeline
-[LDM3D-VR](https://arxiv.org/pdf/2311.03226.pdf) is an extended version of LDM3D. 
-
-The abstract from the paper is:
-*Latent diffusion models have proven to be state-of-the-art in the creation and manipulation of visual outputs. However, as far as we know, the generation of depth maps jointly with RGB is still limited. We introduce LDM3D-VR, a suite of diffusion models targeting virtual reality development that includes LDM3D-pano and LDM3D-SR. These models enable the generation of panoramic RGBD based on textual prompts and the upscaling of low-resolution inputs to high-resolution RGBD, respectively. Our models are fine-tuned from existing pretrained models on datasets containing panoramic/high-resolution RGB images, depth maps and captions. Both models are evaluated in comparison to existing related methods*
-
-Two checkpoints are available for use:
- [ldm3d-pano](https://huggingface.co/Intel/ldm3d-pano). This checkpoint enables the generation of panoramic images and requires the StableDiffusionLDM3DPipeline pipeline to be used.
- [ldm3d-sr](https://huggingface.co/Intel/ldm3d-sr). This checkpoint enables the upscaling of RGB and depth images. Can be used in cascade after the original LDM3D pipeline using the StableDiffusionUpscaleLDM3DPipeline pipeline.
-
-'''py
-from PIL import Image
-import os
-import torch
-from diffusers import StableDiffusionLDM3DPipeline, DiffusionPipeline
-
-#Generate a rgb/depth output from LDM3D
-pipe_ldm3d = StableDiffusionLDM3DPipeline.from_pretrained("Intel/ldm3d-4c")
-pipe_ldm3d.to("cuda")
-
-prompt =f"A picture of some lemons on a table"
-output = pipe_ldm3d(prompt)
-rgb_image, depth_image = output.rgb, output.depth
-rgb_image[0].save(f"lemons_ldm3d_rgb.jpg")
-depth_image[0].save(f"lemons_ldm3d_depth.png")
-
-
-#Upscale the previous output to a resolution of (1024, 1024)
-pipe_ldm3d_upscale = DiffusionPipeline.from_pretrained("Intel/ldm3d-sr", custom_pipeline="pipeline_stable_diffusion_upscale_ldm3d")
-
-pipe_ldm3d_upscale.to("cuda")
-
-low_res_img = Image.open(f"lemons_ldm3d_rgb.jpg").convert("RGB")
-low_res_depth = Image.open(f"lemons_ldm3d_depth.png").convert("L")
-outputs = pipe_ldm3d_upscale(prompt="high quality high resolution uhd 4k image", rgb=low_res_img, depth=low_res_depth, num_inference_steps=50, target_res=[1024, 1024])
-
-upscaled_rgb, upscaled_depth =outputs.rgb[0], outputs.depth[0]
-upscaled_rgb.save(f"upscaled_lemons_rgb.png")
-upscaled_depth.save(f"upscaled_lemons_depth.png")
-'''
-
 ### ControlNet + T2I Adapter Pipeline
 This pipeline combines both ControlNet and T2IAdapter into a single pipeline, where the forward pass is executed once. 
 It receives `control_image` and `adapter_image`, as well as `controlnet_conditioning_scale` and `adapter_conditioning_scale`, for the ControlNet and Adapter modules, respectively. Whenever `adapter_conditioning_scale = 0` or `controlnet_conditioning_scale = 0`, it will act as a full ControlNet module or as a full T2IAdapter module, respectively. 
@@ -2526,181 +2482,6 @@ images[0].save("controlnet_and_adapter_inpaint.png")

 ```

-### Regional Prompting Pipeline
-This pipeline is a port of the [Regional Prompter extension](https://github.com/hako-mikan/sd-webui-regional-prompter) for [Stable Diffusion web UI](https://github.com/AUTOMATIC1111/stable-diffusion-webui) to diffusers.
-This code implements a pipeline for the Stable Diffusion model, enabling the division of the canvas into multiple regions, with different prompts applicable to each region. Users can specify regions in two ways: using `Cols` and `Rows` modes for grid-like divisions, or the `Prompt` mode for regions calculated based on prompts.
-
-![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline1.png)
-
-### Usage
-### Sample Code
-```
-from examples.community.regional_prompting_stable_diffusion import RegionalPromptingStableDiffusionPipeline
-pipe = RegionalPromptingStableDiffusionPipeline.from_single_file(model_path, vae=vae)
-
-rp_args = {
-    "mode":"rows",
-    "div": "1;1;1"
-}  
-
-prompt ="""
-green hair twintail BREAK
-red blouse BREAK
-blue skirt
-"""
-
-images = pipe(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    guidance_scale=7.5,
-    height = 768,
-    width = 512,
-    num_inference_steps =20,
-    num_images_per_prompt = 1,
-    rp_args = rp_args
-        ).images
-
-time = time.strftime(r"%Y%m%d%H%M%S")
-i = 1
-for image in images:
-    i += 1
-    fileName = f'img-{time}-{i+1}.png'
-    image.save(fileName)
-```
-### Cols, Rows mode
-In the Cols, Rows mode, you can split the screen vertically and horizontally and assign prompts to each region. The split ratio can be specified by 'div', and you can set the division ratio like '3;3;2' or '0.1;0.5'. Furthermore, as will be described later, you can also subdivide the split Cols, Rows to specify more complex regions.
-
-In this image, the image is divided into three parts, and a separate prompt is applied to each. The prompts are divided by 'BREAK', and each is applied to the respective region.  
-![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline2.png)
-```
-green hair twintail BREAK
-red blouse BREAK
-blue skirt
-```
-
-### 2-Dimensional division
-The prompt consists of instructions separated by the term `BREAK` and is assigned to different regions of a two-dimensional space. The image is initially split in the main splitting direction, which in this case is rows, due to the presence of a single semicolon`;`, dividing the space into an upper and a lower section. Additional sub-splitting is then applied, indicated by commas. The upper row is split into ratios of `2:1:1`, while the lower row is split into a ratio of `4:6`. Rows themselves are split in a `1:2` ratio. According to the reference image, the blue sky is designated as the first region, green hair as the second, the bookshelf as the third, and so on, in a sequence based on their position from the top left. The terrarium is placed on the desk in the fourth region, and the orange dress and sofa are in the fifth region, conforming to their respective splits.
-```
-rp_args = {
-    "mode":"rows",
-    "div": "1,2,1,1;2,4,6"
-}
-
-prompt ="""
-blue sky BREAK
-green hair BREAK
-book shelf BREAK
-terrarium on desk BREAK
-orange dress and sofa
-"""
-```
-![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline4.png)
-
-### Prompt Mode
-There are limitations to methods of specifying regions in advance. This is because specifying regions can be a hindrance when designating complex shapes or dynamic compositions. When regions are specified by the prompt, the regions are determined after the image generation has begun. This allows us to accommodate compositions and complex regions.
-For further information, see [here](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/main/prompt_en.md).
-### syntax
-```
-baseprompt target1 target2 BREAK
-effect1, target1 BREAK
-effect2 ,target2
-```
-
-First, write the base prompt. In the base prompt, write the words (target1, target2) for which you want to create a mask. Next, separate them with BREAK. Next, write the prompt corresponding to target1. Then enter a comma and write target1. The order of the targets in the base prompt and the order of the BREAK-separated targets can be back to back.
-
-```
-target2 baseprompt target1  BREAK
-effect1, target1 BREAK
-effect2 ,target2
-```
-is also effective.
-
-### Sample
-In this example, masks are calculated for shirt, tie, skirt, and color prompts are specified only for those regions.
-```
-rp_args = {
-    "mode":"prompt-ex",
-    "save_mask":True,
-    "th": "0.4,0.6,0.6",
-}
-
-prompt ="""
-a girl in street with shirt, tie, skirt BREAK
-red, shirt BREAK
-green, tie BREAK
-blue , skirt 
-"""
-```
-![sample](https://github.com/hako-mikan/sd-webui-regional-prompter/blob/imgs/rp_pipeline3.png)
-### threshold
-The threshold used to determine the mask created by the prompt. This can be set as many times as there are masks, as the range varies widely depending on the target prompt. If multiple regions are used, enter them separated by commas. For example, hair tends to be ambiguous and requires a small value, while face tends to be large and requires a small value. These should be ordered by BREAK.
-
-```
-a lady ,hair, face  BREAK
-red, hair BREAK
-tanned ,face
-```
-`threshold : 0.4,0.6`
-If only one input is given for multiple regions, they are all assumed to be the same value.
-
-### Prompt and Prompt-EX
-The difference is that in Prompt, duplicate regions are added, whereas in Prompt-EX, duplicate regions are overwritten sequentially. Since they are processed in order, setting a TARGET with a large regions first makes it easier for the effect of small regions to remain unmuffled.
-
-### Accuracy
-In the case of a 512 x 512 image, Attention mode reduces the size of the region to about 8 x 8 pixels deep in the U-Net, so that small regions get mixed up; Latent mode calculates 64*64, so that the region is exact.  
-```
-girl hair twintail frills,ribbons, dress, face BREAK
-girl, ,face
-```
-
-### Mask
-When an image is generated, the generated mask is displayed. It is generated at the same size as the image, but is actually used at a much smaller size.
-
-
-### Use common prompt
-You can attach the prompt up to ADDCOMM to all prompts by separating it first with ADDCOMM. This is useful when you want to include elements common to all regions. For example, when generating pictures of three people with different appearances, it's necessary to include the instruction of 'three people' in all regions. It's also useful when inserting quality tags and other things. For example, if you write as follows:
-```
-best quality, 3persons in garden, ADDCOMM
-a girl white dress BREAK
-a boy blue shirt BREAK
-an old man red suit
-```
-If common is enabled, this prompt is converted to the following:
-```
-best quality, 3persons in garden, a girl white dress BREAK
-best quality, 3persons in garden, a boy blue shirt BREAK
-best quality, 3persons in garden, an old man red suit
-```
-### Negative prompt
-Negative prompts are equally effective across all regions, but it is possible to set region-specific prompts for negative prompts as well. The number of BREAKs must be the same as the number of prompts. If the number of prompts does not match, the negative prompts will be used without being divided into regions.
-
-### Parameters
-To activate Regional Prompter, it is necessary to enter settings in rp_args. The items that can be set are as follows. rp_args is a dictionary type.
-
-### Input Parameters
-Parameters are specified through the `rp_args` (dictionary type).  
-
-```
-rp_args = {
-    "mode":"rows",
-    "div": "1;1;1"
-}  
-
-pipe(prompt =prompt, rp_args = rp_args)
-```
-
-
-
-### Required Parameters
- `mode`: Specifies the method for defining regions. Choose from `Cols`, `Rows`, `Prompt` or `Prompt-Ex`. This parameter is case-insensitive.
- `div`: Used in `Cols` and `Rows` modes. Details on how to specify this are provided under the respective `Cols` and `Rows` sections.
- `th`: Used in `Prompt` mode. The method of specification is detailed under the `Prompt` section.
-
-### Optional Parameters
- `save_mask`: In `Prompt` mode, choose whether to output the generated mask along with the image. The default is `False`.
-
-The Pipeline supports `compel` syntax. Input prompts using the `compel` structure will be automatically applied and processed.
-
 ## Diffusion Posterior Sampling Pipeline
 * Reference paper
    ```
@@ -2842,82 +2623,3 @@ The Pipeline supports `compel` syntax. Input prompts using the `compel` structur
    * ![dps_mea](https://github.com/tongdaxu/Images/assets/22267548/ff6a33d6-26f0-42aa-88ce-f8a76ba45a13)
    * Reconstructed image:
    * ![dps_generated_image](https://github.com/tongdaxu/Images/assets/22267548/b74f084d-93f4-4845-83d8-44c0fa758a5f)
-
-### DemoFusion
-This pipeline is the official implementation of [DemoFusion: Democratising High-Resolution Image Generation With No $$$](https://arxiv.org/abs/2311.16973).
-The original repo can be found at [repo](https://github.com/PRIS-CV/DemoFusion).
- `view_batch_size` (`int`, defaults to 16):
-  The batch size for multiple denoising paths. Typically, a larger batch size can result in higher efficiency but comes with increased GPU memory requirements.
-
- `stride` (`int`, defaults to 64):
-  The stride of moving local patches. A smaller stride is better for alleviating seam issues, but it also introduces additional computational overhead and inference time.
-
- `cosine_scale_1` (`float`, defaults to 3):
-  Control the strength of skip-residual. For specific impacts, please refer to Appendix C in the DemoFusion paper.
-
- `cosine_scale_2` (`float`, defaults to 1):
-  Control the strength of dilated sampling. For specific impacts, please refer to Appendix C in the DemoFusion paper.
-
- `cosine_scale_3` (`float`, defaults to 1):
-  Control the strength of the Gaussian filter. For specific impacts, please refer to Appendix C in the DemoFusion paper.
-
- `sigma` (`float`, defaults to 1):
-  The standard value of the Gaussian filter. Larger sigma promotes the global guidance of dilated sampling, but has the potential of over-smoothing.
-
- `multi_decoder` (`bool`, defaults to True):
-  Determine whether to use a tiled decoder. Generally, when the resolution exceeds 3072x3072, a tiled decoder becomes necessary.
-
- `show_image` (`bool`, defaults to False):
-  Determine whether to show intermediate results during generation.
-```
-from pipeline_demofusion_sdxl import DemoFusionSDXLPipeline
-
-model_ckpt = "stabilityai/stable-diffusion-xl-base-1.0"
-pipe = DemoFusionSDXLPipeline.from_pretrained(model_ckpt, torch_dtype=torch.float16)
-pipe = pipe.to("cuda")
-
-prompt = "Envision a portrait of an elderly woman, her face a canvas of time, framed by a headscarf with muted tones of rust and cream. Her eyes, blue like faded denim. Her attire, simple yet dignified."
-negative_prompt = "blurry, ugly, duplicate, poorly drawn, deformed, mosaic"
-
-images = pipe(
-    prompt, 
-    negative_prompt=negative_prompt,
-    height=3072, 
-    width=3072, 
-    view_batch_size=16, 
-    stride=64,
-    num_inference_steps=50, 
-    guidance_scale=7.5,
-    cosine_scale_1=3, 
-    cosine_scale_2=1, 
-    cosine_scale_3=1, 
-    sigma=0.8,
-    multi_decoder=True, 
-    show_image=True
-)
-```
-You can display and save the generated images as:
-```
-def image_grid(imgs, save_path=None):
-
-    w = 0
-    for i, img in enumerate(imgs):
-        h_, w_ = imgs[i].size
-        w += w_
-    h = h_
-    grid = Image.new('RGB', size=(w, h))
-    grid_w, grid_h = grid.size
-
-    w = 0
-    for i, img in enumerate(imgs):
-        h_, w_ = imgs[i].size
-        grid.paste(img, box=(w, h - h_))
-        if save_path != None:
-            img.save(save_path + "/img_{}.jpg".format((i + 1) * 1024))
-        w += w_
-        
-    return grid
-
-image_grid(images, save_path="./outputs/")
-```
- ![output_example](https://github.com/PRIS-CV/DemoFusion/blob/main/output_example.png)
@@ -16,7 +16,6 @@

 import ast
 import gc
-import inspect
 import math
 import warnings
 from collections.abc import Iterable
@@ -24,29 +23,16 @@ from typing import Any, Callable, Dict, List, Optional, Union

 import torch
 import torch.nn.functional as F
-from packaging import version
-from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

-from diffusers.configuration_utils import FrozenDict
-from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
-from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.models.attention import Attention, GatedSelfAttentionDense
 from diffusers.models.attention_processor import AttnProcessor2_0
-from diffusers.models.lora import adjust_lora_scale_text_encoder
-from diffusers.pipelines import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipeline
 from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from diffusers.schedulers import KarrasDiffusionSchedulers
-from diffusers.utils import (
-    USE_PEFT_BACKEND,
-    deprecate,
-    logging,
-    replace_example_docstring,
-    scale_lora_layers,
-    unscale_lora_layers,
-)
-from diffusers.utils.torch_utils import randn_tensor
+from diffusers.utils import logging, replace_example_docstring


 EXAMPLE_DOC_STRING = """
@@ -58,7 +44,6 @@ EXAMPLE_DOC_STRING = """
        >>> pipe = DiffusionPipeline.from_pretrained(
        ...     "longlian/lmd_plus",
        ...     custom_pipeline="llm_grounded_diffusion",
-        ...     custom_revision="main",
        ...     variant="fp16", torch_dtype=torch.float16
        ... )
        >>> pipe.enable_model_cpu_offload()
@@ -111,12 +96,7 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

 # All keys in Stable Diffusion models: [('down', 0, 0, 0), ('down', 0, 1, 0), ('down', 1, 0, 0), ('down', 1, 1, 0), ('down', 2, 0, 0), ('down', 2, 1, 0), ('mid', 0, 0, 0), ('up', 1, 0, 0), ('up', 1, 1, 0), ('up', 1, 2, 0), ('up', 2, 0, 0), ('up', 2, 1, 0), ('up', 2, 2, 0), ('up', 3, 0, 0), ('up', 3, 1, 0), ('up', 3, 2, 0)]
 # Note that the first up block is `UpBlock2D` rather than `CrossAttnUpBlock2D` and does not have attention. The last index is always 0 in our case since we have one `BasicTransformerBlock` in each `Transformer2DModel`.
-DEFAULT_GUIDANCE_ATTN_KEYS = [
-    ("mid", 0, 0, 0),
-    ("up", 1, 0, 0),
-    ("up", 1, 1, 0),
-    ("up", 1, 2, 0),
-]
+DEFAULT_GUIDANCE_ATTN_KEYS = [("mid", 0, 0, 0), ("up", 1, 0, 0), ("up", 1, 1, 0), ("up", 1, 2, 0)]


 def convert_attn_keys(key):
@@ -146,15 +126,7 @@ def scale_proportion(obj_box, H, W):

 # Adapted from the parent class `AttnProcessor2_0`
 class AttnProcessorWithHook(AttnProcessor2_0):
-    def __init__(
-        self,
-        attn_processor_key,
-        hidden_size,
-        cross_attention_dim,
-        hook=None,
-        fast_attn=True,
-        enabled=True,
-    ):
+    def __init__(self, attn_processor_key, hidden_size, cross_attention_dim, hook=None, fast_attn=True, enabled=True):
        super().__init__()
        self.attn_processor_key = attn_processor_key
        self.hidden_size = hidden_size
@@ -193,16 +165,15 @@ class AttnProcessorWithHook(AttnProcessor2_0):
        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)

-        args = () if USE_PEFT_BACKEND else (scale,)
-        query = attn.to_q(hidden_states, *args)
+        query = attn.to_q(hidden_states, scale=scale)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

-        key = attn.to_k(encoder_hidden_states, *args)
-        value = attn.to_v(encoder_hidden_states, *args)
+        key = attn.to_k(encoder_hidden_states, scale=scale)
+        value = attn.to_v(encoder_hidden_states, scale=scale)

        inner_dim = key.shape[-1]
        head_dim = inner_dim // attn.heads
@@ -215,13 +186,7 @@ class AttnProcessorWithHook(AttnProcessor2_0):

        if self.hook is not None and self.enabled:
            # Call the hook with query, key, value, and attention maps
-            self.hook(
-                self.attn_processor_key,
-                query_batch_dim,
-                key_batch_dim,
-                value_batch_dim,
-                attention_probs,
-            )
+            self.hook(self.attn_processor_key, query_batch_dim, key_batch_dim, value_batch_dim, attention_probs)

        if self.fast_attn:
            query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
@@ -237,12 +202,7 @@ class AttnProcessorWithHook(AttnProcessor2_0):
            # the output of sdp = (batch, num_heads, seq_len, head_dim)
            # TODO: add support for attn.scale when we move to Torch 2.1
            hidden_states = F.scaled_dot_product_attention(
-                query,
-                key,
-                value,
-                attn_mask=attention_mask,
-                dropout_p=0.0,
-                is_causal=False,
+                query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
            )
            hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
            hidden_states = hidden_states.to(query.dtype)
@@ -251,7 +211,7 @@ class AttnProcessorWithHook(AttnProcessor2_0):
            hidden_states = attn.batch_to_head_dim(hidden_states)

        # linear proj
-        hidden_states = attn.to_out[0](hidden_states, *args)
+        hidden_states = attn.to_out[0](hidden_states, scale=scale)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)

@@ -266,9 +226,7 @@ class AttnProcessorWithHook(AttnProcessor2_0):
        return hidden_states


-class LLMGroundedDiffusionPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
-):
+class LLMGroundedDiffusionPipeline(StableDiffusionPipeline):
    r"""
    Pipeline for layout-grounded text-to-image generation using LLM-grounded Diffusion (LMD+): https://arxiv.org/pdf/2305.13655.pdf.

@@ -299,11 +257,6 @@ class LLMGroundedDiffusionPipeline(
            Whether a safety checker is needed for this pipeline.
    """

-    model_cpu_offload_seq = "text_encoder->unet->vae"
-    _optional_components = ["safety_checker", "feature_extractor", "image_encoder"]
-    _exclude_from_cpu_offload = ["safety_checker"]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
-
    objects_text = "Objects: "
    bg_prompt_text = "Background prompt: "
    bg_prompt_text_no_trailing_space = bg_prompt_text.rstrip()
@@ -319,91 +272,12 @@ class LLMGroundedDiffusionPipeline(
        scheduler: KarrasDiffusionSchedulers,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPImageProcessor,
-        image_encoder: CLIPVisionModelWithProjection = None,
        requires_safety_checker: bool = True,
    ):
-        # This is copied from StableDiffusionPipeline, with hook initializations for LMD+.
-        super().__init__()
-
-        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
-            deprecation_message = (
-                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
-                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
-                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
-                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
-                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
-                " file"
-            )
-            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(scheduler.config)
-            new_config["steps_offset"] = 1
-            scheduler._internal_dict = FrozenDict(new_config)
-
-        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
-            deprecation_message = (
-                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
-                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
-                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
-                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
-                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
-            )
-            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(scheduler.config)
-            new_config["clip_sample"] = False
-            scheduler._internal_dict = FrozenDict(new_config)
-
-        if safety_checker is None and requires_safety_checker:
-            logger.warning(
-                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
-                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
-                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
-                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
-                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
-                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
-            )
-
-        if safety_checker is not None and feature_extractor is None:
-            raise ValueError(
-                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
-                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
-            )
-
-        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
-            version.parse(unet.config._diffusers_version).base_version
-        ) < version.parse("0.9.0.dev0")
-        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
-        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
-            deprecation_message = (
-                "The configuration file of the unet has set the default `sample_size` to smaller than"
-                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
-                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
-                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
-                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
-                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
-                " in the config might lead to incorrect results in future versions. If you have downloaded this"
-                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
-                " the `unet/config.json` file"
-            )
-            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
-            new_config = dict(unet.config)
-            new_config["sample_size"] = 64
-            unet._internal_dict = FrozenDict(new_config)
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-            image_encoder=image_encoder,
+        super().__init__(
+            vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
        )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
-        self.register_to_config(requires_safety_checker=requires_safety_checker)

-        # Initialize the attention hooks for LLM-grounded Diffusion
        self.register_attn_hooks(unet)
        self._saved_attn = None

@@ -590,14 +464,7 @@ class LLMGroundedDiffusionPipeline(

        return token_map

-    def get_phrase_indices(
-        self,
-        prompt,
-        phrases,
-        token_map=None,
-        add_suffix_if_not_found=False,
-        verbose=False,
-    ):
+    def get_phrase_indices(self, prompt, phrases, token_map=None, add_suffix_if_not_found=False, verbose=False):
        for obj in phrases:
            # Suffix the prompt with object name for attention guidance if object is not in the prompt, using "|" to separate the prompt and the suffix
            if obj not in prompt:
@@ -618,14 +485,7 @@ class LLMGroundedDiffusionPipeline(
            phrase_token_map_str = " ".join(phrase_token_map)

            if verbose:
-                logger.info(
-                    "Full str:",
-                    token_map_str,
-                    "Substr:",
-                    phrase_token_map_str,
-                    "Phrase:",
-                    phrases,
-                )
+                logger.info("Full str:", token_map_str, "Substr:", phrase_token_map_str, "Phrase:", phrases)

            # Count the number of token before substr
            # The substring comes with a trailing space that needs to be removed by minus one in the index.
@@ -692,15 +552,7 @@ class LLMGroundedDiffusionPipeline(

        return loss

-    def compute_ca_loss(
-        self,
-        saved_attn,
-        bboxes,
-        phrase_indices,
-        guidance_attn_keys,
-        verbose=False,
-        **kwargs,
-    ):
+    def compute_ca_loss(self, saved_attn, bboxes, phrase_indices, guidance_attn_keys, verbose=False, **kwargs):
        """
         The `saved_attn` is supposed to be passed to `save_attn_to_dict` in `cross_attention_kwargs` prior to computing this loss.
        `AttnProcessor` will put attention maps into the `save_attn_to_dict`.
@@ -753,7 +605,6 @@ class LLMGroundedDiffusionPipeline(
        latents: Optional[torch.FloatTensor] = None,
        prompt_embeds: Optional[torch.FloatTensor] = None,
        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        ip_adapter_image: Optional[PipelineImageInput] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
@@ -811,7 +662,6 @@ class LLMGroundedDiffusionPipeline(
            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
            output_type (`str`, *optional*, defaults to `"pil"`):
                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
            return_dict (`bool`, *optional*, defaults to `True`):
@@ -874,10 +724,9 @@ class LLMGroundedDiffusionPipeline(
                phrase_indices = []
                prompt_parsed = []
                for prompt_item in prompt:
-                    (
-                        phrase_indices_parsed_item,
-                        prompt_parsed_item,
-                    ) = self.get_phrase_indices(prompt_item, add_suffix_if_not_found=True)
+                    phrase_indices_parsed_item, prompt_parsed_item = self.get_phrase_indices(
+                        prompt_item, add_suffix_if_not_found=True
+                    )
                    phrase_indices.append(phrase_indices_parsed_item)
                    prompt_parsed.append(prompt_parsed_item)
                prompt = prompt_parsed
@@ -910,11 +759,6 @@ class LLMGroundedDiffusionPipeline(
        if do_classifier_free_guidance:
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

-        if ip_adapter_image is not None:
-            image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
-            if self.do_classifier_free_guidance:
-                image_embeds = torch.cat([negative_image_embeds, image_embeds])
-
        # 4. Prepare timesteps
        self.scheduler.set_timesteps(num_inference_steps, device=device)
        timesteps = self.scheduler.timesteps
@@ -957,10 +801,7 @@ class LLMGroundedDiffusionPipeline(
        if n_objs:
            cond_boxes[:n_objs] = torch.tensor(boxes)
        text_embeddings = torch.zeros(
-            max_objs,
-            self.unet.config.cross_attention_dim,
-            device=device,
-            dtype=self.text_encoder.dtype,
+            max_objs, self.unet.config.cross_attention_dim, device=device, dtype=self.text_encoder.dtype
        )
        if n_objs:
            text_embeddings[:n_objs] = _text_embeddings
@@ -992,9 +833,6 @@ class LLMGroundedDiffusionPipeline(
        # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

-        # 6.1 Add image embeds for IP-Adapter
-        added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None
-
        loss_attn = torch.tensor(10000.0)

        # 7. Denoising loop
@@ -1031,7 +869,6 @@ class LLMGroundedDiffusionPipeline(
                    t,
                    encoder_hidden_states=prompt_embeds,
                    cross_attention_kwargs=cross_attention_kwargs,
-                    added_cond_kwargs=added_cond_kwargs,
                ).sample

                # perform guidance
@@ -1176,438 +1013,3 @@ class LLMGroundedDiffusionPipeline(
                self.enable_attn_hook(enabled=False)

        return latents, loss
-
-    # Below are methods copied from StableDiffusionPipeline
-    # The design choice of not inheriting from StableDiffusionPipeline is discussed here: https://github.com/huggingface/diffusers/pull/5993#issuecomment-1834258517
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_slicing
-    def enable_vae_slicing(self):
-        r"""
-        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
-        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
-        """
-        self.vae.enable_slicing()
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_slicing
-    def disable_vae_slicing(self):
-        r"""
-        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
-        computing decoding in one step.
-        """
-        self.vae.disable_slicing()
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_vae_tiling
-    def enable_vae_tiling(self):
-        r"""
-        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
-        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
-        processing larger images.
-        """
-        self.vae.enable_tiling()
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
-    def disable_vae_tiling(self):
-        r"""
-        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
-        computing decoding in one step.
-        """
-        self.vae.disable_tiling()
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        lora_scale: Optional[float] = None,
-        **kwargs,
-    ):
-        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
-        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
-
-        prompt_embeds_tuple = self.encode_prompt(
-            prompt=prompt,
-            device=device,
-            num_images_per_prompt=num_images_per_prompt,
-            do_classifier_free_guidance=do_classifier_free_guidance,
-            negative_prompt=negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            lora_scale=lora_scale,
-            **kwargs,
-        )
-
-        # concatenate for backwards comp
-        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
-
-        return prompt_embeds
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt
-    def encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        lora_scale: Optional[float] = None,
-        clip_skip: Optional[int] = None,
-    ):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                prompt to be encoded
-            device: (`torch.device`):
-                torch device
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            do_classifier_free_guidance (`bool`):
-                whether to use classifier free guidance or not
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-                less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            lora_scale (`float`, *optional*):
-                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
-            clip_skip (`int`, *optional*):
-                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
-                the output of the pre-final layer will be used for computing the prompt embeddings.
-        """
-        # set lora scale so that monkey patched LoRA
-        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
-            self._lora_scale = lora_scale
-
-            # dynamically adjust the LoRA scale
-            if not USE_PEFT_BACKEND:
-                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
-            else:
-                scale_lora_layers(self.text_encoder, lora_scale)
-
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            # textual inversion: procecss multi-vector tokens if necessary
-            if isinstance(self, TextualInversionLoaderMixin):
-                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
-
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-
-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            if clip_skip is None:
-                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
-                prompt_embeds = prompt_embeds[0]
-            else:
-                prompt_embeds = self.text_encoder(
-                    text_input_ids.to(device),
-                    attention_mask=attention_mask,
-                    output_hidden_states=True,
-                )
-                # Access the `hidden_states` first, that contains a tuple of
-                # all the hidden states from the encoder layers. Then index into
-                # the tuple to access the hidden states from the desired layer.
-                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
-                # We also need to apply the final LayerNorm here to not mess with the
-                # representations. The `last_hidden_states` that we typically use for
-                # obtaining the final prompt representations passes through the LayerNorm
-                # layer.
-                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
-
-        if self.text_encoder is not None:
-            prompt_embeds_dtype = self.text_encoder.dtype
-        elif self.unet is not None:
-            prompt_embeds_dtype = self.unet.dtype
-        else:
-            prompt_embeds_dtype = prompt_embeds.dtype
-
-        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif prompt is not None and type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            # textual inversion: procecss multi-vector tokens if necessary
-            if isinstance(self, TextualInversionLoaderMixin):
-                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
-
-            max_length = prompt_embeds.shape[1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = uncond_input.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            negative_prompt_embeds = self.text_encoder(
-                uncond_input.input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
-
-        if do_classifier_free_guidance:
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
-
-        return prompt_embeds, negative_prompt_embeds
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image
-    def encode_image(self, image, device, num_images_per_prompt):
-        dtype = next(self.image_encoder.parameters()).dtype
-
-        if not isinstance(image, torch.Tensor):
-            image = self.feature_extractor(image, return_tensors="pt").pixel_values
-
-        image = image.to(device=device, dtype=dtype)
-        image_embeds = self.image_encoder(image).image_embeds
-        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-
-        uncond_image_embeds = torch.zeros_like(image_embeds)
-        return image_embeds, uncond_image_embeds
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker
-    def run_safety_checker(self, image, device, dtype):
-        if self.safety_checker is None:
-            has_nsfw_concept = None
-        else:
-            if torch.is_tensor(image):
-                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
-            else:
-                feature_extractor_input = self.image_processor.numpy_to_pil(image)
-            safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device)
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
-            )
-        return image, has_nsfw_concept
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents
-    def decode_latents(self, latents):
-        deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead"
-        deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False)
-
-        latents = 1 / self.vae.config.scaling_factor * latents
-        image = self.vae.decode(latents, return_dict=False)[0]
-        image = (image / 2 + 0.5).clamp(0, 1)
-        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
-        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
-        return image
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
-    def prepare_extra_step_kwargs(self, generator, eta):
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        if accepts_generator:
-            extra_step_kwargs["generator"] = generator
-        return extra_step_kwargs
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents
-    def prepare_latents(
-        self,
-        batch_size,
-        num_channels_latents,
-        height,
-        width,
-        dtype,
-        device,
-        generator,
-        latents=None,
-    ):
-        shape = (
-            batch_size,
-            num_channels_latents,
-            height // self.vae_scale_factor,
-            width // self.vae_scale_factor,
-        )
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            latents = latents.to(device)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-        return latents
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_freeu
-    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
-        r"""Enables the FreeU mechanism as in https://arxiv.org/abs/2309.11497.
-
-        The suffixes after the scaling factors represent the stages where they are being applied.
-
-        Please refer to the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of the values
-        that are known to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
-
-        Args:
-            s1 (`float`):
-                Scaling factor for stage 1 to attenuate the contributions of the skip features. This is done to
-                mitigate "oversmoothing effect" in the enhanced denoising process.
-            s2 (`float`):
-                Scaling factor for stage 2 to attenuate the contributions of the skip features. This is done to
-                mitigate "oversmoothing effect" in the enhanced denoising process.
-            b1 (`float`): Scaling factor for stage 1 to amplify the contributions of backbone features.
-            b2 (`float`): Scaling factor for stage 2 to amplify the contributions of backbone features.
-        """
-        if not hasattr(self, "unet"):
-            raise ValueError("The pipeline must have `unet` for using FreeU.")
-        self.unet.enable_freeu(s1=s1, s2=s2, b1=b1, b2=b2)
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
-    def disable_freeu(self):
-        """Disables the FreeU mechanism if enabled."""
-        self.unet.disable_freeu()
-
-    # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding
-    def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32):
-        """
-        See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298
-
-        Args:
-            timesteps (`torch.Tensor`):
-                generate embedding vectors at these timesteps
-            embedding_dim (`int`, *optional*, defaults to 512):
-                dimension of the embeddings to generate
-            dtype:
-                data type of the generated embeddings
-
-        Returns:
-            `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)`
-        """
-        assert len(w.shape) == 1
-        w = w * 1000.0
-
-        half_dim = embedding_dim // 2
-        emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1)
-        emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb)
-        emb = w.to(dtype)[:, None] * emb[None, :]
-        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1)
-        if embedding_dim % 2 == 1:  # zero pad
-            emb = torch.nn.functional.pad(emb, (0, 1))
-        assert emb.shape == (w.shape[0], embedding_dim)
-        return emb
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_scale
-    @property
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.guidance_rescale
-    @property
-    def guidance_rescale(self):
-        return self._guidance_rescale
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.clip_skip
-    @property
-    def clip_skip(self):
-        return self._clip_skip
-
-    # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-    # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-    # corresponds to doing no classifier free guidance.
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.do_classifier_free_guidance
-    @property
-    def do_classifier_free_guidance(self):
-        return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.cross_attention_kwargs
-    @property
-    def cross_attention_kwargs(self):
-        return self._cross_attention_kwargs
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.num_timesteps
-    @property
-    def num_timesteps(self):
-        return self._num_timesteps
@@ -1,772 +0,0 @@
-# Copyright 2023 The Intel Labs Team Authors and the HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-from typing import Any, Callable, Dict, List, Optional, Union
-
-import numpy as np
-import PIL
-import torch
-from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
-
-from diffusers import DiffusionPipeline
-from diffusers.image_processor import PipelineDepthInput, PipelineImageInput, VaeImageProcessorLDM3D
-from diffusers.loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
-from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.models.lora import adjust_lora_scale_text_encoder
-from diffusers.pipelines.stable_diffusion import StableDiffusionSafetyChecker
-from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d import LDM3DPipelineOutput
-from diffusers.schedulers import DDPMScheduler, KarrasDiffusionSchedulers
-from diffusers.utils import (
-    USE_PEFT_BACKEND,
-    deprecate,
-    logging,
-    scale_lora_layers,
-    unscale_lora_layers,
-)
-from diffusers.utils.torch_utils import randn_tensor
-
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```python
-        >>> from diffusers import StableDiffusionUpscaleLDM3DPipeline
-        >>> from PIL import Image
-        >>> from io import BytesIO
-        >>> import requests
-
-        >>> pipe = StableDiffusionUpscaleLDM3DPipeline.from_pretrained("Intel/ldm3d-sr")
-        >>> pipe = pipe.to("cuda")
-        >>> rgb_path = "https://huggingface.co/Intel/ldm3d-sr/resolve/main/lemons_ldm3d_rgb.jpg"
-        >>> depth_path = "https://huggingface.co/Intel/ldm3d-sr/resolve/main/lemons_ldm3d_depth.png"
-        >>> low_res_rgb = Image.open(BytesIO(requests.get(rgb_path).content)).convert("RGB")
-        >>> low_res_depth = Image.open(BytesIO(requests.get(depth_path).content)).convert("L")
-        >>> output = pipe(
-        ...     prompt="high quality high resolution uhd 4k image",
-        ...     rgb=low_res_rgb,
-        ...     depth=low_res_depth,
-        ...     num_inference_steps=50,
-        ...     target_res=[1024, 1024],
-        ... )
-        >>> rgb_image, depth_image = output.rgb, output.depth
-        >>> rgb_image[0].save("hr_ldm3d_rgb.jpg")
-        >>> depth_image[0].save("hr_ldm3d_depth.png")
-        ```
-"""
-
-
-class StableDiffusionUpscaleLDM3DPipeline(
-    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin
-):
-    r"""
-    Pipeline for text-to-image and 3D generation using LDM3D.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
-    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
-
-    The pipeline also inherits the following loading methods:
-        - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings
-        - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights
-        - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights
-        - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files
-
-    Args:
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations.
-        text_encoder ([`~transformers.CLIPTextModel`]):
-            Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)).
-        tokenizer ([`~transformers.CLIPTokenizer`]):
-            A `CLIPTokenizer` to tokenize text.
-        unet ([`UNet2DConditionModel`]):
-            A `UNet2DConditionModel` to denoise the encoded image latents.
-        low_res_scheduler ([`SchedulerMixin`]):
-            A scheduler used to add initial noise to the low resolution conditioning image. It must be an instance of
-            [`DDPMScheduler`].
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
-            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
-        safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details
-            about a model's potential harms.
-        feature_extractor ([`~transformers.CLIPImageProcessor`]):
-            A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`.
-    """
-
-    _optional_components = ["safety_checker", "feature_extractor"]
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        low_res_scheduler: DDPMScheduler,
-        scheduler: KarrasDiffusionSchedulers,
-        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPImageProcessor,
-        requires_safety_checker: bool = True,
-        watermarker: Optional[Any] = None,
-        max_noise_level: int = 350,
-    ):
-        super().__init__()
-
-        if safety_checker is None and requires_safety_checker:
-            logger.warning(
-                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
-                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
-                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
-                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
-                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
-                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
-            )
-
-        if safety_checker is not None and feature_extractor is None:
-            raise ValueError(
-                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
-                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
-            )
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            low_res_scheduler=low_res_scheduler,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            watermarker=watermarker,
-            feature_extractor=feature_extractor,
-        )
-        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
-        self.image_processor = VaeImageProcessorLDM3D(vae_scale_factor=self.vae_scale_factor, resample="bilinear")
-        # self.register_to_config(requires_safety_checker=requires_safety_checker)
-        self.register_to_config(max_noise_level=max_noise_level)
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d.StableDiffusionLDM3DPipeline._encode_prompt
-    def _encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        lora_scale: Optional[float] = None,
-        **kwargs,
-    ):
-        deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple."
-        deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False)
-
-        prompt_embeds_tuple = self.encode_prompt(
-            prompt=prompt,
-            device=device,
-            num_images_per_prompt=num_images_per_prompt,
-            do_classifier_free_guidance=do_classifier_free_guidance,
-            negative_prompt=negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            lora_scale=lora_scale,
-            **kwargs,
-        )
-
-        # concatenate for backwards comp
-        prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]])
-
-        return prompt_embeds
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_ldm3d.StableDiffusionLDM3DPipeline.encode_prompt
-    def encode_prompt(
-        self,
-        prompt,
-        device,
-        num_images_per_prompt,
-        do_classifier_free_guidance,
-        negative_prompt=None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        lora_scale: Optional[float] = None,
-        clip_skip: Optional[int] = None,
-    ):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                prompt to be encoded
-            device: (`torch.device`):
-                torch device
-            num_images_per_prompt (`int`):
-                number of images that should be generated per prompt
-            do_classifier_free_guidance (`bool`):
-                whether to use classifier free guidance or not
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-                less than `1`).
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            lora_scale (`float`, *optional*):
-                A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
-            clip_skip (`int`, *optional*):
-                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
-                the output of the pre-final layer will be used for computing the prompt embeddings.
-        """
-        # set lora scale so that monkey patched LoRA
-        # function of text encoder can correctly access it
-        if lora_scale is not None and isinstance(self, LoraLoaderMixin):
-            self._lora_scale = lora_scale
-
-            # dynamically adjust the LoRA scale
-            if not USE_PEFT_BACKEND:
-                adjust_lora_scale_text_encoder(self.text_encoder, lora_scale)
-            else:
-                scale_lora_layers(self.text_encoder, lora_scale)
-
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            # textual inversion: procecss multi-vector tokens if necessary
-            if isinstance(self, TextualInversionLoaderMixin):
-                prompt = self.maybe_convert_prompt(prompt, self.tokenizer)
-
-            text_inputs = self.tokenizer(
-                prompt,
-                padding="max_length",
-                max_length=self.tokenizer.model_max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-            text_input_ids = text_inputs.input_ids
-            untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-
-            if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(
-                text_input_ids, untruncated_ids
-            ):
-                removed_text = self.tokenizer.batch_decode(
-                    untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1]
-                )
-                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
-                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
-                )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = text_inputs.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            if clip_skip is None:
-                prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask)
-                prompt_embeds = prompt_embeds[0]
-            else:
-                prompt_embeds = self.text_encoder(
-                    text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True
-                )
-                # Access the `hidden_states` first, that contains a tuple of
-                # all the hidden states from the encoder layers. Then index into
-                # the tuple to access the hidden states from the desired layer.
-                prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)]
-                # We also need to apply the final LayerNorm here to not mess with the
-                # representations. The `last_hidden_states` that we typically use for
-                # obtaining the final prompt representations passes through the LayerNorm
-                # layer.
-                prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds)
-
-        if self.text_encoder is not None:
-            prompt_embeds_dtype = self.text_encoder.dtype
-        elif self.unet is not None:
-            prompt_embeds_dtype = self.unet.dtype
-        else:
-            prompt_embeds_dtype = prompt_embeds.dtype
-
-        prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
-
-        bs_embed, seq_len, _ = prompt_embeds.shape
-        # duplicate text embeddings for each generation per prompt, using mps friendly method
-        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
-        prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1)
-
-        # get unconditional embeddings for classifier free guidance
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
-            uncond_tokens: List[str]
-            if negative_prompt is None:
-                uncond_tokens = [""] * batch_size
-            elif prompt is not None and type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif isinstance(negative_prompt, str):
-                uncond_tokens = [negative_prompt]
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-            else:
-                uncond_tokens = negative_prompt
-
-            # textual inversion: procecss multi-vector tokens if necessary
-            if isinstance(self, TextualInversionLoaderMixin):
-                uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer)
-
-            max_length = prompt_embeds.shape[1]
-            uncond_input = self.tokenizer(
-                uncond_tokens,
-                padding="max_length",
-                max_length=max_length,
-                truncation=True,
-                return_tensors="pt",
-            )
-
-            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
-                attention_mask = uncond_input.attention_mask.to(device)
-            else:
-                attention_mask = None
-
-            negative_prompt_embeds = self.text_encoder(
-                uncond_input.input_ids.to(device),
-                attention_mask=attention_mask,
-            )
-            negative_prompt_embeds = negative_prompt_embeds[0]
-
-        if do_classifier_free_guidance:
-            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
-            seq_len = negative_prompt_embeds.shape[1]
-
-            negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device)
-
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
-
-        if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND:
-            # Retrieve the original scale by scaling back the LoRA layers
-            unscale_lora_layers(self.text_encoder, lora_scale)
-
-        return prompt_embeds, negative_prompt_embeds
-
-    def run_safety_checker(self, image, device, dtype):
-        if self.safety_checker is None:
-            has_nsfw_concept = None
-        else:
-            if torch.is_tensor(image):
-                feature_extractor_input = self.image_processor.postprocess(image, output_type="pil")
-            else:
-                feature_extractor_input = self.image_processor.numpy_to_pil(image)
-            rgb_feature_extractor_input = feature_extractor_input[0]
-            safety_checker_input = self.feature_extractor(rgb_feature_extractor_input, return_tensors="pt").to(device)
-            image, has_nsfw_concept = self.safety_checker(
-                images=image, clip_input=safety_checker_input.pixel_values.to(dtype)
-            )
-        return image, has_nsfw_concept
-
-    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs
-    def prepare_extra_step_kwargs(self, generator, eta):
-        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
-        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
-        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
-        # and should be between [0, 1]
-
-        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        extra_step_kwargs = {}
-        if accepts_eta:
-            extra_step_kwargs["eta"] = eta
-
-        # check if the scheduler accepts generator
-        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
-        if accepts_generator:
-            extra_step_kwargs["generator"] = generator
-        return extra_step_kwargs
-
-    def check_inputs(
-        self,
-        prompt,
-        image,
-        noise_level,
-        callback_steps,
-        negative_prompt=None,
-        prompt_embeds=None,
-        negative_prompt_embeds=None,
-        target_res=None,
-    ):
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-
-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if negative_prompt is not None and negative_prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
-                f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
-            )
-
-        if prompt_embeds is not None and negative_prompt_embeds is not None:
-            if prompt_embeds.shape != negative_prompt_embeds.shape:
-                raise ValueError(
-                    "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
-                    f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
-                    f" {negative_prompt_embeds.shape}."
-                )
-
-        if (
-            not isinstance(image, torch.Tensor)
-            and not isinstance(image, PIL.Image.Image)
-            and not isinstance(image, np.ndarray)
-            and not isinstance(image, list)
-        ):
-            raise ValueError(
-                f"`image` has to be of type `torch.Tensor`, `np.ndarray`, `PIL.Image.Image` or `list` but is {type(image)}"
-            )
-
-        # verify batch size of prompt and image are same if image is a list or tensor or numpy array
-        if isinstance(image, list) or isinstance(image, torch.Tensor) or isinstance(image, np.ndarray):
-            if prompt is not None and isinstance(prompt, str):
-                batch_size = 1
-            elif prompt is not None and isinstance(prompt, list):
-                batch_size = len(prompt)
-            else:
-                batch_size = prompt_embeds.shape[0]
-
-            if isinstance(image, list):
-                image_batch_size = len(image)
-            else:
-                image_batch_size = image.shape[0]
-            if batch_size != image_batch_size:
-                raise ValueError(
-                    f"`prompt` has batch size {batch_size} and `image` has batch size {image_batch_size}."
-                    " Please make sure that passed `prompt` matches the batch size of `image`."
-                )
-
-        # check noise level
-        if noise_level > self.config.max_noise_level:
-            raise ValueError(f"`noise_level` has to be <= {self.config.max_noise_level} but is {noise_level}")
-
-        if (callback_steps is None) or (
-            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
-        ):
-            raise ValueError(
-                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
-                f" {type(callback_steps)}."
-            )
-
-    def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None):
-        shape = (batch_size, num_channels_latents, height, width)
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            if latents.shape != shape:
-                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
-            latents = latents.to(device)
-
-        # scale the initial noise by the standard deviation required by the scheduler
-        latents = latents * self.scheduler.init_noise_sigma
-        return latents
-
-    # def upcast_vae(self):
-    #     dtype = self.vae.dtype
-    #     self.vae.to(dtype=torch.float32)
-    #     use_torch_2_0_or_xformers = isinstance(
-    #         self.vae.decoder.mid_block.attentions[0].processor,
-    #         (
-    #             AttnProcessor2_0,
-    #             XFormersAttnProcessor,
-    #             LoRAXFormersAttnProcessor,
-    #             LoRAAttnProcessor2_0,
-    #         ),
-    #     )
-    #     # if xformers or torch_2_0 is used attention block does not need
-    #     # to be in float32 which can save lots of memory
-    #     if use_torch_2_0_or_xformers:
-    #         self.vae.post_quant_conv.to(dtype)
-    #         self.vae.decoder.conv_in.to(dtype)
-    #         self.vae.decoder.mid_block.to(dtype)
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: Union[str, List[str]] = None,
-        rgb: PipelineImageInput = None,
-        depth: PipelineDepthInput = None,
-        num_inference_steps: int = 75,
-        guidance_scale: float = 9.0,
-        noise_level: int = 20,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: float = 0.0,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        prompt_embeds: Optional[torch.FloatTensor] = None,
-        negative_prompt_embeds: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
-        callback_steps: int = 1,
-        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-        target_res: Optional[List[int]] = [1024, 1024],
-    ):
-        r"""
-        The call function to the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`.
-            image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`):
-                `Image` or tensor representing an image batch to be upscaled.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, *optional*, defaults to 5.0):
-                A higher guidance scale value encourages the model to generate images closely linked to the text
-                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
-                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
-                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
-                generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
-                provided, text embeddings are generated from the `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
-                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
-                plain tuple.
-            callback (`Callable`, *optional*):
-                A function that calls every `callback_steps` steps during inference. The function is called with the
-                following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
-            callback_steps (`int`, *optional*, defaults to 1):
-                The frequency at which the `callback` function is called. If not specified, the callback is called at
-                every step.
-            cross_attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
-                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
-
-        Examples:
-
-        Returns:
-            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
-                If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned,
-                otherwise a `tuple` is returned where the first element is a list with the generated images and the
-                second element is a list of `bool`s indicating whether the corresponding generated image contains
-                "not-safe-for-work" (nsfw) content.
-        """
-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(
-            prompt,
-            rgb,
-            noise_level,
-            callback_steps,
-            negative_prompt,
-            prompt_embeds,
-            negative_prompt_embeds,
-        )
-        # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        device = self._execution_device
-        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
-        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
-        # corresponds to doing no classifier free guidance.
-        do_classifier_free_guidance = guidance_scale > 1.0
-
-        # 3. Encode input prompt
-        prompt_embeds, negative_prompt_embeds = self.encode_prompt(
-            prompt,
-            device,
-            num_images_per_prompt,
-            do_classifier_free_guidance,
-            negative_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-        )
-        # For classifier free guidance, we need to do two forward passes.
-        # Here we concatenate the unconditional and text embeddings into a single batch
-        # to avoid doing two forward passes
-        if do_classifier_free_guidance:
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-
-        # 4. Preprocess image
-        rgb, depth = self.image_processor.preprocess(rgb, depth, target_res=target_res)
-        rgb = rgb.to(dtype=prompt_embeds.dtype, device=device)
-        depth = depth.to(dtype=prompt_embeds.dtype, device=device)
-
-        # 5. set timesteps
-        self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps = self.scheduler.timesteps
-
-        # 6. Encode low resolutiom image to latent space
-        image = torch.cat([rgb, depth], axis=1)
-        latent_space_image = self.vae.encode(image).latent_dist.sample(generator)
-        latent_space_image *= self.vae.scaling_factor
-        noise_level = torch.tensor([noise_level], dtype=torch.long, device=device)
-        # noise_rgb = randn_tensor(rgb.shape, generator=generator, device=device, dtype=prompt_embeds.dtype)
-        # rgb = self.low_res_scheduler.add_noise(rgb, noise_rgb, noise_level)
-        # noise_depth = randn_tensor(depth.shape, generator=generator, device=device, dtype=prompt_embeds.dtype)
-        # depth = self.low_res_scheduler.add_noise(depth, noise_depth, noise_level)
-
-        batch_multiplier = 2 if do_classifier_free_guidance else 1
-        latent_space_image = torch.cat([latent_space_image] * batch_multiplier * num_images_per_prompt)
-        noise_level = torch.cat([noise_level] * latent_space_image.shape[0])
-
-        # 7. Prepare latent variables
-        height, width = latent_space_image.shape[2:]
-        num_channels_latents = self.vae.config.latent_channels
-
-        latents = self.prepare_latents(
-            batch_size * num_images_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            prompt_embeds.dtype,
-            device,
-            generator,
-            latents,
-        )
-
-        # 8. Check that sizes of image and latents match
-        num_channels_image = latent_space_image.shape[1]
-        if num_channels_latents + num_channels_image != self.unet.config.in_channels:
-            raise ValueError(
-                f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects"
-                f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +"
-                f" `num_channels_image`: {num_channels_image} "
-                f" = {num_channels_latents+num_channels_image}. Please verify the config of"
-                " `pipeline.unet` or your `image` input."
-            )
-
-        # 9. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline
-        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
-
-        # 10. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                # expand the latents if we are doing classifier free guidance
-                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
-
-                # concat latents, mask, masked_image_latents in the channel dimension
-                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-                latent_model_input = torch.cat([latent_model_input, latent_space_image], dim=1)
-
-                # predict the noise residual
-                noise_pred = self.unet(
-                    latent_model_input,
-                    t,
-                    encoder_hidden_states=prompt_embeds,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    class_labels=noise_level,
-                    return_dict=False,
-                )[0]
-
-                # perform guidance
-                if do_classifier_free_guidance:
-                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
-
-                # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0]
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-                    if callback is not None and i % callback_steps == 0:
-                        callback(i, t, latents)
-
-        if not output_type == "latent":
-            # make sure the VAE is in float32 mode, as it overflows in float16
-            needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
-
-            if needs_upcasting:
-                self.upcast_vae()
-                latents = latents.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
-
-            image = self.vae.decode(latents / self.vae.scaling_factor, return_dict=False)[0]
-
-            # cast back to fp16 if needed
-            if needs_upcasting:
-                self.vae.to(dtype=torch.float16)
-
-            image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype)
-
-        else:
-            image = latents
-            has_nsfw_concept = None
-
-        if has_nsfw_concept is None:
-            do_denormalize = [True] * image.shape[0]
-        else:
-            do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept]
-
-        rgb, depth = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize)
-
-        # 11. Apply watermark
-        if output_type == "pil" and self.watermarker is not None:
-            rgb = self.watermarker.apply_watermark(rgb)
-
-        # Offload last model to CPU
-        if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None:
-            self.final_offload_hook.offload()
-
-        if not return_dict:
-            return ((rgb, depth), has_nsfw_concept)
-
-        return LDM3DPipelineOutput(rgb=rgb, depth=depth, nsfw_content_detected=has_nsfw_concept)
@@ -1470,15 +1470,7 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline(DiffusionPipeline, FromS
        height, width = self._default_height_width(height, width, adapter_image)
        device = self._execution_device

-        if isinstance(adapter, MultiAdapter):
-            adapter_input = []
-            for one_image in adapter_image:
-                one_image = _preprocess_adapter_image(one_image, height, width)
-                one_image = one_image.to(device=device, dtype=adapter.dtype)
-                adapter_input.append(one_image)
-        else:
-            adapter_input = _preprocess_adapter_image(adapter_image, height, width)
-            adapter_input = adapter_input.to(device=device, dtype=adapter.dtype)
+        adapter_input = _preprocess_adapter_image(adapter_image, height, width).to(device)

        original_size = original_size or (height, width)
        target_size = target_size or (height, width)
@@ -1651,14 +1643,10 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline(DiffusionPipeline, FromS
        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)

        # 10. Prepare added time ids & embeddings & adapter features
-        if isinstance(adapter, MultiAdapter):
-            adapter_state = adapter(adapter_input, adapter_conditioning_scale)
-            for k, v in enumerate(adapter_state):
-                adapter_state[k] = v
-        else:
-            adapter_state = adapter(adapter_input)
-            for k, v in enumerate(adapter_state):
-                adapter_state[k] = v * adapter_conditioning_scale
+        adapter_input = adapter_input.type(latents.dtype)
+        adapter_state = adapter(adapter_input)
+        for k, v in enumerate(adapter_state):
+            adapter_state[k] = v * adapter_conditioning_scale
        if num_images_per_prompt > 1:
            for k, v in enumerate(adapter_state):
                adapter_state[k] = v.repeat(num_images_per_prompt, 1, 1, 1)
@@ -1,589 +0,0 @@
-import math
-from typing import Dict, Optional
-
-import torch
-import torchvision.transforms.functional as FF
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
-
-from diffusers import StableDiffusionPipeline
-from diffusers.models import AutoencoderKL, UNet2DConditionModel
-from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
-from diffusers.schedulers import KarrasDiffusionSchedulers
-from diffusers.utils import USE_PEFT_BACKEND
-
-
-try:
-    from compel import Compel
-except ImportError:
-    Compel = None
-
-KCOMM = "ADDCOMM"
-KBRK = "BREAK"
-
-
-class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
-    r"""
-    Args for Regional Prompting Pipeline:
-        rp_args:dict
-        Required
-            rp_args["mode"]: cols, rows, prompt, prompt-ex
-        for cols, rows mode
-            rp_args["div"]: ex) 1;1;1(Divide into 3 regions)
-        for prompt, prompt-ex mode
-            rp_args["th"]: ex) 0.5,0.5,0.6 (threshold for prompt mode)
-
-        Optional
-            rp_args["save_mask"]: True/False (save masks in prompt mode)
-
-    Pipeline for text-to-image generation using Stable Diffusion.
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
-    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
-
-    Args:
-        vae ([`AutoencoderKL`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
-        text_encoder ([`CLIPTextModel`]):
-            Frozen text-encoder. Stable Diffusion uses the text portion of
-            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
-            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
-        tokenizer (`CLIPTokenizer`):
-            Tokenizer of class
-            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
-        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
-        scheduler ([`SchedulerMixin`]):
-            A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of
-            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
-        safety_checker ([`StableDiffusionSafetyChecker`]):
-            Classification module that estimates whether generated images could be considered offensive or harmful.
-            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
-        feature_extractor ([`CLIPImageProcessor`]):
-            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
-    """
-
-    def __init__(
-        self,
-        vae: AutoencoderKL,
-        text_encoder: CLIPTextModel,
-        tokenizer: CLIPTokenizer,
-        unet: UNet2DConditionModel,
-        scheduler: KarrasDiffusionSchedulers,
-        safety_checker: StableDiffusionSafetyChecker,
-        feature_extractor: CLIPFeatureExtractor,
-        requires_safety_checker: bool = True,
-    ):
-        super().__init__(
-            vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
-        )
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            unet=unet,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-        )
-
-    @torch.no_grad()
-    def __call__(
-        self,
-        prompt: str,
-        height: int = 512,
-        width: int = 512,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        negative_prompt: str = None,
-        num_images_per_prompt: Optional[int] = 1,
-        eta: float = 0.0,
-        generator: Optional[torch.Generator] = None,
-        latents: Optional[torch.FloatTensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        rp_args: Dict[str, str] = None,
-    ):
-        active = KBRK in prompt[0] if type(prompt) == list else KBRK in prompt  # noqa: E721
-        if negative_prompt is None:
-            negative_prompt = "" if type(prompt) == str else [""] * len(prompt)  # noqa: E721
-
-        device = self._execution_device
-        regions = 0
-
-        self.power = int(rp_args["power"]) if "power" in rp_args else 1
-
-        prompts = prompt if type(prompt) == list else [prompt]  # noqa: E721
-        n_prompts = negative_prompt if type(negative_prompt) == list else [negative_prompt]  # noqa: E721
-        self.batch = batch = num_images_per_prompt * len(prompts)
-        all_prompts_cn, all_prompts_p = promptsmaker(prompts, num_images_per_prompt)
-        all_n_prompts_cn, _ = promptsmaker(n_prompts, num_images_per_prompt)
-
-        cn = len(all_prompts_cn) == len(all_n_prompts_cn)
-
-        if Compel:
-            compel = Compel(tokenizer=self.tokenizer, text_encoder=self.text_encoder)
-
-            def getcompelembs(prps):
-                embl = []
-                for prp in prps:
-                    embl.append(compel.build_conditioning_tensor(prp))
-                return torch.cat(embl)
-
-            conds = getcompelembs(all_prompts_cn)
-            unconds = getcompelembs(all_n_prompts_cn) if cn else getcompelembs(n_prompts)
-            embs = getcompelembs(prompts)
-            n_embs = getcompelembs(n_prompts)
-            prompt = negative_prompt = None
-        else:
-            conds = self.encode_prompt(prompts, device, 1, True)[0]
-            unconds = (
-                self.encode_prompt(n_prompts, device, 1, True)[0]
-                if cn
-                else self.encode_prompt(all_n_prompts_cn, device, 1, True)[0]
-            )
-            embs = n_embs = None
-
-        if not active:
-            pcallback = None
-            mode = None
-        else:
-            if any(x in rp_args["mode"].upper() for x in ["COL", "ROW"]):
-                mode = "COL" if "COL" in rp_args["mode"].upper() else "ROW"
-                ocells, icells, regions = make_cells(rp_args["div"])
-
-            elif "PRO" in rp_args["mode"].upper():
-                regions = len(all_prompts_p[0])
-                mode = "PROMPT"
-                reset_attnmaps(self)
-                self.ex = "EX" in rp_args["mode"].upper()
-                self.target_tokens = target_tokens = tokendealer(self, all_prompts_p)
-                thresholds = [float(x) for x in rp_args["th"].split(",")]
-
-            orig_hw = (height, width)
-            revers = True
-
-            def pcallback(s_self, step: int, timestep: int, latents: torch.FloatTensor, selfs=None):
-                if "PRO" in mode:  # in Prompt mode, make masks from sum of attension maps
-                    self.step = step
-
-                    if len(self.attnmaps_sizes) > 3:
-                        self.history[step] = self.attnmaps.copy()
-                        for hw in self.attnmaps_sizes:
-                            allmasks = []
-                            basemasks = [None] * batch
-                            for tt, th in zip(target_tokens, thresholds):
-                                for b in range(batch):
-                                    key = f"{tt}-{b}"
-                                    _, mask, _ = makepmask(self, self.attnmaps[key], hw[0], hw[1], th, step)
-                                    mask = mask.unsqueeze(0).unsqueeze(-1)
-                                    if self.ex:
-                                        allmasks[b::batch] = [x - mask for x in allmasks[b::batch]]
-                                        allmasks[b::batch] = [torch.where(x > 0, 1, 0) for x in allmasks[b::batch]]
-                                    allmasks.append(mask)
-                                    basemasks[b] = mask if basemasks[b] is None else basemasks[b] + mask
-                            basemasks = [1 - mask for mask in basemasks]
-                            basemasks = [torch.where(x > 0, 1, 0) for x in basemasks]
-                            allmasks = basemasks + allmasks
-
-                            self.attnmasks[hw] = torch.cat(allmasks)
-                        self.maskready = True
-                return latents
-
-            def hook_forward(module):
-                # diffusers==0.23.2
-                def forward(
-                    hidden_states: torch.FloatTensor,
-                    encoder_hidden_states: Optional[torch.FloatTensor] = None,
-                    attention_mask: Optional[torch.FloatTensor] = None,
-                    temb: Optional[torch.FloatTensor] = None,
-                    scale: float = 1.0,
-                ) -> torch.Tensor:
-                    attn = module
-                    xshape = hidden_states.shape
-                    self.hw = (h, w) = split_dims(xshape[1], *orig_hw)
-
-                    if revers:
-                        nx, px = hidden_states.chunk(2)
-                    else:
-                        px, nx = hidden_states.chunk(2)
-
-                    if cn:
-                        hidden_states = torch.cat([px for i in range(regions)] + [nx for i in range(regions)], 0)
-                        encoder_hidden_states = torch.cat([conds] + [unconds])
-                    else:
-                        hidden_states = torch.cat([px for i in range(regions)] + [nx], 0)
-                        encoder_hidden_states = torch.cat([conds] + [unconds])
-
-                    residual = hidden_states
-
-                    args = () if USE_PEFT_BACKEND else (scale,)
-
-                    if attn.spatial_norm is not None:
-                        hidden_states = attn.spatial_norm(hidden_states, temb)
-
-                    input_ndim = hidden_states.ndim
-
-                    if input_ndim == 4:
-                        batch_size, channel, height, width = hidden_states.shape
-                        hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
-
-                    batch_size, sequence_length, _ = (
-                        hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
-                    )
-
-                    if attention_mask is not None:
-                        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-                        attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
-
-                    if attn.group_norm is not None:
-                        hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
-
-                    args = () if USE_PEFT_BACKEND else (scale,)
-                    query = attn.to_q(hidden_states, *args)
-
-                    if encoder_hidden_states is None:
-                        encoder_hidden_states = hidden_states
-                    elif attn.norm_cross:
-                        encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
-
-                    key = attn.to_k(encoder_hidden_states, *args)
-                    value = attn.to_v(encoder_hidden_states, *args)
-
-                    inner_dim = key.shape[-1]
-                    head_dim = inner_dim // attn.heads
-
-                    query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
-                    key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-                    value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
-
-                    # the output of sdp = (batch, num_heads, seq_len, head_dim)
-                    # TODO: add support for attn.scale when we move to Torch 2.1
-                    hidden_states = scaled_dot_product_attention(
-                        self,
-                        query,
-                        key,
-                        value,
-                        attn_mask=attention_mask,
-                        dropout_p=0.0,
-                        is_causal=False,
-                        getattn="PRO" in mode,
-                    )
-
-                    hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
-                    hidden_states = hidden_states.to(query.dtype)
-
-                    # linear proj
-                    hidden_states = attn.to_out[0](hidden_states, *args)
-                    # dropout
-                    hidden_states = attn.to_out[1](hidden_states)
-
-                    if input_ndim == 4:
-                        hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
-
-                    if attn.residual_connection:
-                        hidden_states = hidden_states + residual
-
-                    hidden_states = hidden_states / attn.rescale_output_factor
-
-                    #### Regional Prompting Col/Row mode
-                    if any(x in mode for x in ["COL", "ROW"]):
-                        reshaped = hidden_states.reshape(hidden_states.size()[0], h, w, hidden_states.size()[2])
-                        center = reshaped.shape[0] // 2
-                        px = reshaped[0:center] if cn else reshaped[0:-batch]
-                        nx = reshaped[center:] if cn else reshaped[-batch:]
-                        outs = [px, nx] if cn else [px]
-                        for out in outs:
-                            c = 0
-                            for i, ocell in enumerate(ocells):
-                                for icell in icells[i]:
-                                    if "ROW" in mode:
-                                        out[
-                                            0:batch,
-                                            int(h * ocell[0]) : int(h * ocell[1]),
-                                            int(w * icell[0]) : int(w * icell[1]),
-                                            :,
-                                        ] = out[
-                                            c * batch : (c + 1) * batch,
-                                            int(h * ocell[0]) : int(h * ocell[1]),
-                                            int(w * icell[0]) : int(w * icell[1]),
-                                            :,
-                                        ]
-                                    else:
-                                        out[
-                                            0:batch,
-                                            int(h * icell[0]) : int(h * icell[1]),
-                                            int(w * ocell[0]) : int(w * ocell[1]),
-                                            :,
-                                        ] = out[
-                                            c * batch : (c + 1) * batch,
-                                            int(h * icell[0]) : int(h * icell[1]),
-                                            int(w * ocell[0]) : int(w * ocell[1]),
-                                            :,
-                                        ]
-                                    c += 1
-                        px, nx = (px[0:batch], nx[0:batch]) if cn else (px[0:batch], nx)
-                        hidden_states = torch.cat([nx, px], 0) if revers else torch.cat([px, nx], 0)
-                        hidden_states = hidden_states.reshape(xshape)
-
-                    #### Regional Prompting Prompt mode
-                    elif "PRO" in mode:
-                        center = reshaped.shape[0] // 2
-                        px = reshaped[0:center] if cn else reshaped[0:-batch]
-                        nx = reshaped[center:] if cn else reshaped[-batch:]
-
-                        if (h, w) in self.attnmasks and self.maskready:
-
-                            def mask(input):
-                                out = torch.multiply(input, self.attnmasks[(h, w)])
-                                for b in range(batch):
-                                    for r in range(1, regions):
-                                        out[b] = out[b] + out[r * batch + b]
-                                return out
-
-                            px, nx = (mask(px), mask(nx)) if cn else (mask(px), nx)
-                        px, nx = (px[0:batch], nx[0:batch]) if cn else (px[0:batch], nx)
-                        hidden_states = torch.cat([nx, px], 0) if revers else torch.cat([px, nx], 0)
-                    return hidden_states
-
-                return forward
-
-            def hook_forwards(root_module: torch.nn.Module):
-                for name, module in root_module.named_modules():
-                    if "attn2" in name and module.__class__.__name__ == "Attention":
-                        module.forward = hook_forward(module)
-
-            hook_forwards(self.unet)
-
-        output = StableDiffusionPipeline(**self.components)(
-            prompt=prompt,
-            prompt_embeds=embs,
-            negative_prompt=negative_prompt,
-            negative_prompt_embeds=n_embs,
-            height=height,
-            width=width,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-            num_images_per_prompt=num_images_per_prompt,
-            eta=eta,
-            generator=generator,
-            latents=latents,
-            output_type=output_type,
-            return_dict=return_dict,
-            callback_on_step_end=pcallback,
-        )
-
-        if "save_mask" in rp_args:
-            save_mask = rp_args["save_mask"]
-        else:
-            save_mask = False
-
-        if mode == "PROMPT" and save_mask:
-            saveattnmaps(self, output, height, width, thresholds, num_inference_steps // 2, regions)
-
-        return output
-
-
-### Make prompt list for each regions
-def promptsmaker(prompts, batch):
-    out_p = []
-    plen = len(prompts)
-    for prompt in prompts:
-        add = ""
-        if KCOMM in prompt:
-            add, prompt = prompt.split(KCOMM)
-            add = add + " "
-        prompts = prompt.split(KBRK)
-        out_p.append([add + p for p in prompts])
-    out = [None] * batch * len(out_p[0]) * len(out_p)
-    for p, prs in enumerate(out_p):  # inputs prompts
-        for r, pr in enumerate(prs):  # prompts for regions
-            start = (p + r * plen) * batch
-            out[start : start + batch] = [pr] * batch  # P1R1B1,P1R1B2...,P1R2B1,P1R2B2...,P2R1B1...
-    return out, out_p
-
-
-### make regions from ratios
-### ";" makes outercells, "," makes inner cells
-def make_cells(ratios):
-    if ";" not in ratios and "," in ratios:
-        ratios = ratios.replace(",", ";")
-    ratios = ratios.split(";")
-    ratios = [inratios.split(",") for inratios in ratios]
-
-    icells = []
-    ocells = []
-
-    def startend(cells, array):
-        current_start = 0
-        array = [float(x) for x in array]
-        for value in array:
-            end = current_start + (value / sum(array))
-            cells.append([current_start, end])
-            current_start = end
-
-    startend(ocells, [r[0] for r in ratios])
-
-    for inratios in ratios:
-        if 2 > len(inratios):
-            icells.append([[0, 1]])
-        else:
-            add = []
-            startend(add, inratios[1:])
-            icells.append(add)
-
-    return ocells, icells, sum(len(cell) for cell in icells)
-
-
-def make_emblist(self, prompts):
-    with torch.no_grad():
-        tokens = self.tokenizer(
-            prompts, max_length=self.tokenizer.model_max_length, padding=True, truncation=True, return_tensors="pt"
-        ).input_ids.to(self.device)
-        embs = self.text_encoder(tokens, output_hidden_states=True).last_hidden_state.to(self.device, dtype=self.dtype)
-    return embs
-
-
-def split_dims(xs, height, width):
-    xs = xs
-
-    def repeat_div(x, y):
-        while y > 0:
-            x = math.ceil(x / 2)
-            y = y - 1
-        return x
-
-    scale = math.ceil(math.log2(math.sqrt(height * width / xs)))
-    dsh = repeat_div(height, scale)
-    dsw = repeat_div(width, scale)
-    return dsh, dsw
-
-
-##### for prompt mode
-def get_attn_maps(self, attn):
-    height, width = self.hw
-    target_tokens = self.target_tokens
-    if (height, width) not in self.attnmaps_sizes:
-        self.attnmaps_sizes.append((height, width))
-
-    for b in range(self.batch):
-        for t in target_tokens:
-            power = self.power
-            add = attn[b, :, :, t[0] : t[0] + len(t)] ** (power) * (self.attnmaps_sizes.index((height, width)) + 1)
-            add = torch.sum(add, dim=2)
-            key = f"{t}-{b}"
-            if key not in self.attnmaps:
-                self.attnmaps[key] = add
-            else:
-                if self.attnmaps[key].shape[1] != add.shape[1]:
-                    add = add.view(8, height, width)
-                    add = FF.resize(add, self.attnmaps_sizes[0], antialias=None)
-                    add = add.reshape_as(self.attnmaps[key])
-
-                self.attnmaps[key] = self.attnmaps[key] + add
-
-
-def reset_attnmaps(self):  # init parameters in every batch
-    self.step = 0
-    self.attnmaps = {}  # maked from attention maps
-    self.attnmaps_sizes = []  # height,width set of u-net blocks
-    self.attnmasks = {}  # maked from attnmaps for regions
-    self.maskready = False
-    self.history = {}
-
-
-def saveattnmaps(self, output, h, w, th, step, regions):
-    masks = []
-    for i, mask in enumerate(self.history[step].values()):
-        img, _, mask = makepmask(self, mask, h, w, th[i % len(th)], step)
-        if self.ex:
-            masks = [x - mask for x in masks]
-            masks.append(mask)
-            if len(masks) == regions - 1:
-                output.images.extend([FF.to_pil_image(mask) for mask in masks])
-                masks = []
-        else:
-            output.images.append(img)
-
-
-def makepmask(
-    self, mask, h, w, th, step
-):  # make masks from attention cache return [for preview, for attention, for Latent]
-    th = th - step * 0.005
-    if 0.05 >= th:
-        th = 0.05
-    mask = torch.mean(mask, dim=0)
-    mask = mask / mask.max().item()
-    mask = torch.where(mask > th, 1, 0)
-    mask = mask.float()
-    mask = mask.view(1, *self.attnmaps_sizes[0])
-    img = FF.to_pil_image(mask)
-    img = img.resize((w, h))
-    mask = FF.resize(mask, (h, w), interpolation=FF.InterpolationMode.NEAREST, antialias=None)
-    lmask = mask
-    mask = mask.reshape(h * w)
-    mask = torch.where(mask > 0.1, 1, 0)
-    return img, mask, lmask
-
-
-def tokendealer(self, all_prompts):
-    for prompts in all_prompts:
-        targets = [p.split(",")[-1] for p in prompts[1:]]
-        tt = []
-
-        for target in targets:
-            ptokens = (
-                self.tokenizer(
-                    prompts,
-                    max_length=self.tokenizer.model_max_length,
-                    padding=True,
-                    truncation=True,
-                    return_tensors="pt",
-                ).input_ids
-            )[0]
-            ttokens = (
-                self.tokenizer(
-                    target,
-                    max_length=self.tokenizer.model_max_length,
-                    padding=True,
-                    truncation=True,
-                    return_tensors="pt",
-                ).input_ids
-            )[0]
-
-            tlist = []
-
-            for t in range(ttokens.shape[0] - 2):
-                for p in range(ptokens.shape[0]):
-                    if ttokens[t + 1] == ptokens[p]:
-                        tlist.append(p)
-            if tlist != []:
-                tt.append(tlist)
-
-    return tt
-
-
-def scaled_dot_product_attention(
-    self, query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None, getattn=False
-) -> torch.Tensor:
-    # Efficient implementation equivalent to the following:
-    L, S = query.size(-2), key.size(-2)
-    scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
-    attn_bias = torch.zeros(L, S, dtype=query.dtype, device=self.device)
-    if is_causal:
-        assert attn_mask is None
-        temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
-        attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
-        attn_bias.to(query.dtype)
-
-    if attn_mask is not None:
-        if attn_mask.dtype == torch.bool:
-            attn_mask.masked_fill_(attn_mask.logical_not(), float("-inf"))
-        else:
-            attn_bias += attn_mask
-    attn_weight = query @ key.transpose(-2, -1) * scale_factor
-    attn_weight += attn_bias
-    attn_weight = torch.softmax(attn_weight, dim=-1)
-    if getattn:
-        get_attn_maps(self, attn_weight)
-    attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
-    return attn_weight @ value
@@ -41,7 +41,7 @@ from polygraphy.backend.trt import (
    save_engine,
 )
 from polygraphy.backend.trt import util as trt_util
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer

 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import (
@@ -709,7 +709,6 @@ class TensorRTStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline):
        scheduler: DDIMScheduler,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
-        image_encoder: CLIPVisionModelWithProjection = None,
        requires_safety_checker: bool = True,
        stages=["clip", "unet", "vae", "vae_encoder"],
        image_height: int = 512,
@@ -725,15 +724,7 @@ class TensorRTStableDiffusionImg2ImgPipeline(StableDiffusionImg2ImgPipeline):
        timing_cache: str = "timing_cache",
    ):
        super().__init__(
-            vae,
-            text_encoder,
-            tokenizer,
-            unet,
-            scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-            image_encoder=image_encoder,
-            requires_safety_checker=requires_safety_checker,
+            vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
        )

        self.vae.forward = self.vae.decode
@@ -41,7 +41,7 @@ from polygraphy.backend.trt import (
    save_engine,
 )
 from polygraphy.backend.trt import util as trt_util
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer

 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import (
@@ -710,7 +710,6 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
        scheduler: DDIMScheduler,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
-        image_encoder: CLIPVisionModelWithProjection = None,
        requires_safety_checker: bool = True,
        stages=["clip", "unet", "vae", "vae_encoder"],
        image_height: int = 512,
@@ -726,15 +725,7 @@ class TensorRTStableDiffusionInpaintPipeline(StableDiffusionInpaintPipeline):
        timing_cache: str = "timing_cache",
    ):
        super().__init__(
-            vae,
-            text_encoder,
-            tokenizer,
-            unet,
-            scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-            image_encoder=image_encoder,
-            requires_safety_checker=requires_safety_checker,
+            vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
        )

        self.vae.forward = self.vae.decode
@@ -40,7 +40,7 @@ from polygraphy.backend.trt import (
    save_engine,
 )
 from polygraphy.backend.trt import util as trt_util
-from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer

 from diffusers.models import AutoencoderKL, UNet2DConditionModel
 from diffusers.pipelines.stable_diffusion import (
@@ -624,7 +624,6 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
        scheduler: DDIMScheduler,
        safety_checker: StableDiffusionSafetyChecker,
        feature_extractor: CLIPFeatureExtractor,
-        image_encoder: CLIPVisionModelWithProjection = None,
        requires_safety_checker: bool = True,
        stages=["clip", "unet", "vae"],
        image_height: int = 768,
@@ -640,15 +639,7 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
        timing_cache: str = "timing_cache",
    ):
        super().__init__(
-            vae,
-            text_encoder,
-            tokenizer,
-            unet,
-            scheduler,
-            safety_checker=safety_checker,
-            feature_extractor=feature_extractor,
-            image_encoder=image_encoder,
-            requires_safety_checker=requires_safety_checker,
+            vae, text_encoder, tokenizer, unet, scheduler, safety_checker, feature_extractor, requires_safety_checker
        )

        self.vae.forward = self.vae.decode
@@ -71,7 +71,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.18.0.dev0")

 logger = get_logger(__name__)

@@ -72,7 +72,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.18.0.dev0")

 logger = get_logger(__name__)

@@ -70,7 +70,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.18.0.dev0")

 logger = get_logger(__name__)

@@ -71,7 +71,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.18.0.dev0")

 logger = get_logger(__name__)

@@ -1,120 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import sys
-import tempfile
-
-
-sys.path.append("..")
-from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-logger = logging.getLogger()
-stream_handler = logging.StreamHandler(sys.stdout)
-logger.addHandler(stream_handler)
-
-
-class ControlNet(ExamplesTestsAccelerate):
-    def test_controlnet_checkpointing_checkpoints_total_limit(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-            examples/controlnet/train_controlnet.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-            --dataset_name=hf-internal-testing/fill10
-            --output_dir={tmpdir}
-            --resolution=64
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --max_train_steps=6
-            --checkpoints_total_limit=2
-            --checkpointing_steps=2
-            --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
-            """.split()
-
-            run_command(self._launch_args + test_args)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-4", "checkpoint-6"},
-            )
-
-    def test_controlnet_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-            examples/controlnet/train_controlnet.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-            --dataset_name=hf-internal-testing/fill10
-            --output_dir={tmpdir}
-            --resolution=64
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
-            --max_train_steps=9
-            --checkpointing_steps=2
-            """.split()
-
-            run_command(self._launch_args + test_args)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
-            )
-
-            resume_run_args = f"""
-            examples/controlnet/train_controlnet.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-            --dataset_name=hf-internal-testing/fill10
-            --output_dir={tmpdir}
-            --resolution=64
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet
-            --max_train_steps=11
-            --checkpointing_steps=2
-            --resume_from_checkpoint=checkpoint-8
-            --checkpoints_total_limit=3
-            """.split()
-
-            run_command(self._launch_args + resume_run_args)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-8", "checkpoint-10", "checkpoint-12"},
-            )
-
-
-class ControlNetSDXL(ExamplesTestsAccelerate):
-    def test_controlnet_sdxl(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-            examples/controlnet/train_controlnet_sdxl.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-xl-pipe
-            --dataset_name=hf-internal-testing/fill10
-            --output_dir={tmpdir}
-            --resolution=64
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --controlnet_model_name_or_path=hf-internal-testing/tiny-controlnet-sdxl
-            --max_train_steps=9
-            --checkpointing_steps=2
-            """.split()
-
-            run_command(self._launch_args + test_args)
-
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
@@ -56,7 +56,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__)

@@ -59,7 +59,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = logging.getLogger(__name__)

@@ -58,7 +58,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__)

@@ -1,130 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import sys
-import tempfile
-
-
-sys.path.append("..")
-from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-logger = logging.getLogger()
-stream_handler = logging.StreamHandler(sys.stdout)
-logger.addHandler(stream_handler)
-
-
-class CustomDiffusion(ExamplesTestsAccelerate):
-    def test_custom_diffusion(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/custom_diffusion/train_custom_diffusion.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt <new1>
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 1.0e-05
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --modifier_token <new1>
-                --no_safe_serialization
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_custom_diffusion_weights.bin")))
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "<new1>.bin")))
-
-    def test_custom_diffusion_checkpointing_checkpoints_total_limit(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-            examples/custom_diffusion/train_custom_diffusion.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-            --instance_data_dir=docs/source/en/imgs
-            --output_dir={tmpdir}
-            --instance_prompt=<new1>
-            --resolution=64
-            --train_batch_size=1
-            --modifier_token=<new1>
-            --dataloader_num_workers=0
-            --max_train_steps=6
-            --checkpoints_total_limit=2
-            --checkpointing_steps=2
-            --no_safe_serialization
-            """.split()
-
-            run_command(self._launch_args + test_args)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-4", "checkpoint-6"},
-            )
-
-    def test_custom_diffusion_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-            examples/custom_diffusion/train_custom_diffusion.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-            --instance_data_dir=docs/source/en/imgs
-            --output_dir={tmpdir}
-            --instance_prompt=<new1>
-            --resolution=64
-            --train_batch_size=1
-            --modifier_token=<new1>
-            --dataloader_num_workers=0
-            --max_train_steps=9
-            --checkpointing_steps=2
-            --no_safe_serialization
-            """.split()
-
-            run_command(self._launch_args + test_args)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
-            )
-
-            resume_run_args = f"""
-            examples/custom_diffusion/train_custom_diffusion.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-            --instance_data_dir=docs/source/en/imgs
-            --output_dir={tmpdir}
-            --instance_prompt=<new1>
-            --resolution=64
-            --train_batch_size=1
-            --modifier_token=<new1>
-            --dataloader_num_workers=0
-            --max_train_steps=11
-            --checkpointing_steps=2
-            --resume_from_checkpoint=checkpoint-8
-            --checkpoints_total_limit=3
-            --no_safe_serialization
-            """.split()
-
-            run_command(self._launch_args + resume_run_args)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
-            )
@@ -62,7 +62,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__)

@@ -1,230 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import shutil
-import sys
-import tempfile
-
-from diffusers import DiffusionPipeline, UNet2DConditionModel
-
-
-sys.path.append("..")
-from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-logger = logging.getLogger()
-stream_handler = logging.StreamHandler(sys.stdout)
-logger.addHandler(stream_handler)
-
-
-class DreamBooth(ExamplesTestsAccelerate):
-    def test_dreambooth(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/dreambooth/train_dreambooth.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors")))
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
-
-    def test_dreambooth_if(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/dreambooth/train_dreambooth.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-if-pipe
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --pre_compute_text_embeddings
-                --tokenizer_max_length=77
-                --text_encoder_use_attention_mask
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors")))
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
-
-    def test_dreambooth_checkpointing(self):
-        instance_prompt = "photo"
-        pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Run training script with checkpointing
-            # max_train_steps == 5, checkpointing_steps == 2
-            # Should create checkpoints at steps 2, 4
-
-            initial_run_args = f"""
-                examples/dreambooth/train_dreambooth.py
-                --pretrained_model_name_or_path {pretrained_model_name_or_path}
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt {instance_prompt}
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 5
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=2
-                --seed=0
-                """.split()
-
-            run_command(self._launch_args + initial_run_args)
-
-            # check can run the original fully trained output pipeline
-            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
-            pipe(instance_prompt, num_inference_steps=2)
-
-            # check checkpoint directories exist
-            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-2")))
-            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4")))
-
-            # check can run an intermediate checkpoint
-            unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet")
-            pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, unet=unet, safety_checker=None)
-            pipe(instance_prompt, num_inference_steps=2)
-
-            # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
-            shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
-
-            # Run training script for 7 total steps resuming from checkpoint 4
-
-            resume_run_args = f"""
-                examples/dreambooth/train_dreambooth.py
-                --pretrained_model_name_or_path {pretrained_model_name_or_path}
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt {instance_prompt}
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 7
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=2
-                --resume_from_checkpoint=checkpoint-4
-                --seed=0
-                """.split()
-
-            run_command(self._launch_args + resume_run_args)
-
-            # check can run new fully trained pipeline
-            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
-            pipe(instance_prompt, num_inference_steps=2)
-
-            # check old checkpoints do not exist
-            self.assertFalse(os.path.isdir(os.path.join(tmpdir, "checkpoint-2")))
-
-            # check new checkpoints exist
-            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-4")))
-            self.assertTrue(os.path.isdir(os.path.join(tmpdir, "checkpoint-6")))
-
-    def test_dreambooth_checkpointing_checkpoints_total_limit(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-            examples/dreambooth/train_dreambooth.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-            --instance_data_dir=docs/source/en/imgs
-            --output_dir={tmpdir}
-            --instance_prompt=prompt
-            --resolution=64
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --max_train_steps=6
-            --checkpoints_total_limit=2
-            --checkpointing_steps=2
-            """.split()
-
-            run_command(self._launch_args + test_args)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-4", "checkpoint-6"},
-            )
-
-    def test_dreambooth_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-            examples/dreambooth/train_dreambooth.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-            --instance_data_dir=docs/source/en/imgs
-            --output_dir={tmpdir}
-            --instance_prompt=prompt
-            --resolution=64
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --max_train_steps=9
-            --checkpointing_steps=2
-            """.split()
-
-            run_command(self._launch_args + test_args)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
-            )
-
-            resume_run_args = f"""
-            examples/dreambooth/train_dreambooth.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-            --instance_data_dir=docs/source/en/imgs
-            --output_dir={tmpdir}
-            --instance_prompt=prompt
-            --resolution=64
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --max_train_steps=11
-            --checkpointing_steps=2
-            --resume_from_checkpoint=checkpoint-8
-            --checkpoints_total_limit=3
-            """.split()
-
-            run_command(self._launch_args + resume_run_args)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
-            )
@@ -1,388 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import sys
-import tempfile
-
-import safetensors
-
-
-sys.path.append("..")
-from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
-
-from diffusers import DiffusionPipeline  # noqa: E402
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-logger = logging.getLogger()
-stream_handler = logging.StreamHandler(sys.stdout)
-logger.addHandler(stream_handler)
-
-
-class DreamBoothLoRA(ExamplesTestsAccelerate):
-    def test_dreambooth_lora(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/dreambooth/train_dreambooth_lora.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # when not training the text encoder, all the parameters in the state dict should start
-            # with `"unet"` in their names.
-            starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys())
-            self.assertTrue(starts_with_unet)
-
-    def test_dreambooth_lora_with_text_encoder(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/dreambooth/train_dreambooth_lora.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --train_text_encoder
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # check `text_encoder` is present at all.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            keys = lora_state_dict.keys()
-            is_text_encoder_present = any(k.startswith("text_encoder") for k in keys)
-            self.assertTrue(is_text_encoder_present)
-
-            # the names of the keys of the state dict should either start with `unet`
-            # or `text_encoder`.
-            is_correct_naming = all(k.startswith("unet") or k.startswith("text_encoder") for k in keys)
-            self.assertTrue(is_correct_naming)
-
-    def test_dreambooth_lora_checkpointing_checkpoints_total_limit(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-            examples/dreambooth/train_dreambooth_lora.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-            --instance_data_dir=docs/source/en/imgs
-            --output_dir={tmpdir}
-            --instance_prompt=prompt
-            --resolution=64
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --max_train_steps=6
-            --checkpoints_total_limit=2
-            --checkpointing_steps=2
-            """.split()
-
-            run_command(self._launch_args + test_args)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-4", "checkpoint-6"},
-            )
-
-    def test_dreambooth_lora_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-            examples/dreambooth/train_dreambooth_lora.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-            --instance_data_dir=docs/source/en/imgs
-            --output_dir={tmpdir}
-            --instance_prompt=prompt
-            --resolution=64
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --max_train_steps=9
-            --checkpointing_steps=2
-            """.split()
-
-            run_command(self._launch_args + test_args)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
-            )
-
-            resume_run_args = f"""
-            examples/dreambooth/train_dreambooth_lora.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-            --instance_data_dir=docs/source/en/imgs
-            --output_dir={tmpdir}
-            --instance_prompt=prompt
-            --resolution=64
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --max_train_steps=11
-            --checkpointing_steps=2
-            --resume_from_checkpoint=checkpoint-8
-            --checkpoints_total_limit=3
-            """.split()
-
-            run_command(self._launch_args + resume_run_args)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
-            )
-
-    def test_dreambooth_lora_if_model(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/dreambooth/train_dreambooth_lora.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-if-pipe
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --pre_compute_text_embeddings
-                --tokenizer_max_length=77
-                --text_encoder_use_attention_mask
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # when not training the text encoder, all the parameters in the state dict should start
-            # with `"unet"` in their names.
-            starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys())
-            self.assertTrue(starts_with_unet)
-
-
-class DreamBoothLoRASDXL(ExamplesTestsAccelerate):
-    def test_dreambooth_lora_sdxl(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/dreambooth/train_dreambooth_lora_sdxl.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # when not training the text encoder, all the parameters in the state dict should start
-            # with `"unet"` in their names.
-            starts_with_unet = all(key.startswith("unet") for key in lora_state_dict.keys())
-            self.assertTrue(starts_with_unet)
-
-    def test_dreambooth_lora_sdxl_with_text_encoder(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/dreambooth/train_dreambooth_lora_sdxl.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --train_text_encoder
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # when not training the text encoder, all the parameters in the state dict should start
-            # with `"unet"` or `"text_encoder"` or `"text_encoder_2"` in their names.
-            keys = lora_state_dict.keys()
-            starts_with_unet = all(
-                k.startswith("unet") or k.startswith("text_encoder") or k.startswith("text_encoder_2") for k in keys
-            )
-            self.assertTrue(starts_with_unet)
-
-    def test_dreambooth_lora_sdxl_custom_captions(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/dreambooth/train_dreambooth_lora_sdxl.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --caption_column text
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-
-    def test_dreambooth_lora_sdxl_text_encoder_custom_captions(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/dreambooth/train_dreambooth_lora_sdxl.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --caption_column text
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --train_text_encoder
-                """.split()
-
-            run_command(self._launch_args + test_args)
-
-    def test_dreambooth_lora_sdxl_checkpointing_checkpoints_total_limit(self):
-        pipeline_path = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/dreambooth/train_dreambooth_lora_sdxl.py
-                --pretrained_model_name_or_path {pipeline_path}
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 7
-                --checkpointing_steps=2
-                --checkpoints_total_limit=2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-
-            pipe = DiffusionPipeline.from_pretrained(pipeline_path)
-            pipe.load_lora_weights(tmpdir)
-            pipe("a prompt", num_inference_steps=2)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                # checkpoint-2 should have been deleted
-                {"checkpoint-4", "checkpoint-6"},
-            )
-
-    def test_dreambooth_lora_sdxl_text_encoder_checkpointing_checkpoints_total_limit(self):
-        pipeline_path = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/dreambooth/train_dreambooth_lora_sdxl.py
-                --pretrained_model_name_or_path {pipeline_path}
-                --instance_data_dir docs/source/en/imgs
-                --instance_prompt photo
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 7
-                --checkpointing_steps=2
-                --checkpoints_total_limit=2
-                --train_text_encoder
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-
-            pipe = DiffusionPipeline.from_pretrained(pipeline_path)
-            pipe.load_lora_weights(tmpdir)
-            pipe("a prompt", num_inference_steps=2)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                # checkpoint-2 should have been deleted
-                {"checkpoint-4", "checkpoint-6"},
-            )
@@ -61,7 +61,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__)

@@ -300,7 +300,7 @@ def parse_args(input_args=None):
    parser.add_argument(
        "--output_dir",
        type=str,
-        default="dreambooth-model",
+        default="text-inversion-model",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
@@ -35,7 +35,7 @@ from diffusers.utils import check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 # Cache compiled models across invocations of this script.
 cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))
@@ -65,7 +65,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__)

@@ -58,7 +58,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__)

@@ -1,101 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import sys
-import tempfile
-
-
-sys.path.append("..")
-from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-logger = logging.getLogger()
-stream_handler = logging.StreamHandler(sys.stdout)
-logger.addHandler(stream_handler)
-
-
-class InstructPix2Pix(ExamplesTestsAccelerate):
-    def test_instruct_pix2pix_checkpointing_checkpoints_total_limit(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/instruct_pix2pix/train_instruct_pix2pix.py
-                --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-                --dataset_name=hf-internal-testing/instructpix2pix-10-samples
-                --resolution=64
-                --random_flip
-                --train_batch_size=1
-                --max_train_steps=7
-                --checkpointing_steps=2
-                --checkpoints_total_limit=2
-                --output_dir {tmpdir}
-                --seed=0
-                """.split()
-
-            run_command(self._launch_args + test_args)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-4", "checkpoint-6"},
-            )
-
-    def test_instruct_pix2pix_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/instruct_pix2pix/train_instruct_pix2pix.py
-                --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-                --dataset_name=hf-internal-testing/instructpix2pix-10-samples
-                --resolution=64
-                --random_flip
-                --train_batch_size=1
-                --max_train_steps=9
-                --checkpointing_steps=2
-                --output_dir {tmpdir}
-                --seed=0
-                """.split()
-
-            run_command(self._launch_args + test_args)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
-            )
-
-            resume_run_args = f"""
-                examples/instruct_pix2pix/train_instruct_pix2pix.py
-                --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-pipe
-                --dataset_name=hf-internal-testing/instructpix2pix-10-samples
-                --resolution=64
-                --random_flip
-                --train_batch_size=1
-                --max_train_steps=11
-                --checkpointing_steps=2
-                --output_dir {tmpdir}
-                --seed=0
-                --resume_from_checkpoint=checkpoint-8
-                --checkpoints_total_limit=3
-                """.split()
-
-            run_command(self._launch_args + resume_run_args)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
-            )
@@ -52,7 +52,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -55,7 +55,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -52,7 +52,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -46,7 +46,7 @@ from diffusers.utils import check_min_version, is_wandb_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -46,7 +46,7 @@ from diffusers.utils import check_min_version, is_wandb_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -51,7 +51,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -1,51 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import sys
-import tempfile
-
-
-sys.path.append("..")
-from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-logger = logging.getLogger()
-stream_handler = logging.StreamHandler(sys.stdout)
-logger.addHandler(stream_handler)
-
-
-class T2IAdapter(ExamplesTestsAccelerate):
-    def test_t2i_adapter_sdxl(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-            examples/t2i_adapter/train_t2i_adapter_sdxl.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-stable-diffusion-xl-pipe
-            --adapter_model_name_or_path=hf-internal-testing/tiny-adapter
-            --dataset_name=hf-internal-testing/fill10
-            --output_dir={tmpdir}
-            --resolution=64
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --max_train_steps=9
-            --checkpointing_steps=2
-            """.split()
-
-            run_command(self._launch_args + test_args)
-
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
@@ -58,7 +58,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__)

@@ -1,61 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import shutil
-import subprocess
-import tempfile
-import unittest
-from typing import List
-
-from accelerate.utils import write_basic_config
-
-
-# These utils relate to ensuring the right error message is received when running scripts
-class SubprocessCallException(Exception):
-    pass
-
-
-def run_command(command: List[str], return_stdout=False):
-    """
-    Runs `command` with `subprocess.check_output` and will potentially return the `stdout`. Will also properly capture
-    if an error occurred while running `command`
-    """
-    try:
-        output = subprocess.check_output(command, stderr=subprocess.STDOUT)
-        if return_stdout:
-            if hasattr(output, "decode"):
-                output = output.decode("utf-8")
-            return output
-    except subprocess.CalledProcessError as e:
-        raise SubprocessCallException(
-            f"Command `{' '.join(command)}` failed with the following error:\n\n{e.output.decode()}"
-        ) from e
-
-
-class ExamplesTestsAccelerate(unittest.TestCase):
-    @classmethod
-    def setUpClass(cls):
-        super().setUpClass()
-        cls._tmpdir = tempfile.mkdtemp()
-        cls.configPath = os.path.join(cls._tmpdir, "default_config.yml")
-
-        write_basic_config(save_location=cls.configPath)
-        cls._launch_args = ["accelerate", "launch", "--config_file", cls.configPath]
-
-    @classmethod
-    def tearDownClass(cls):
-        super().tearDownClass()
-        shutil.rmtree(cls._tmpdir)
@@ -1,373 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import shutil
-import sys
-import tempfile
-
-from diffusers import DiffusionPipeline, UNet2DConditionModel  # noqa: E402
-
-
-sys.path.append("..")
-from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-logger = logging.getLogger()
-stream_handler = logging.StreamHandler(sys.stdout)
-logger.addHandler(stream_handler)
-
-
-class TextToImage(ExamplesTestsAccelerate):
-    def test_text_to_image(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/text_to_image/train_text_to_image.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --center_crop
-                --random_flip
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors")))
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
-
-    def test_text_to_image_checkpointing(self):
-        pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
-        prompt = "a prompt"
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Run training script with checkpointing
-            # max_train_steps == 5, checkpointing_steps == 2
-            # Should create checkpoints at steps 2, 4
-
-            initial_run_args = f"""
-                examples/text_to_image/train_text_to_image.py
-                --pretrained_model_name_or_path {pretrained_model_name_or_path}
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --center_crop
-                --random_flip
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 5
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=2
-                --seed=0
-                """.split()
-
-            run_command(self._launch_args + initial_run_args)
-
-            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
-            pipe(prompt, num_inference_steps=2)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-2", "checkpoint-4"},
-            )
-
-            # check can run an intermediate checkpoint
-            unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet")
-            pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, unet=unet, safety_checker=None)
-            pipe(prompt, num_inference_steps=2)
-
-            # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
-            shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
-
-            # Run training script for 7 total steps resuming from checkpoint 4
-
-            resume_run_args = f"""
-                examples/text_to_image/train_text_to_image.py
-                --pretrained_model_name_or_path {pretrained_model_name_or_path}
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --center_crop
-                --random_flip
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 7
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=2
-                --resume_from_checkpoint=checkpoint-4
-                --seed=0
-                """.split()
-
-            run_command(self._launch_args + resume_run_args)
-
-            # check can run new fully trained pipeline
-            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
-            pipe(prompt, num_inference_steps=2)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {
-                    # no checkpoint-2 -> check old checkpoints do not exist
-                    # check new checkpoints exist
-                    "checkpoint-4",
-                    "checkpoint-6",
-                },
-            )
-
-    def test_text_to_image_checkpointing_use_ema(self):
-        pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
-        prompt = "a prompt"
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Run training script with checkpointing
-            # max_train_steps == 5, checkpointing_steps == 2
-            # Should create checkpoints at steps 2, 4
-
-            initial_run_args = f"""
-                examples/text_to_image/train_text_to_image.py
-                --pretrained_model_name_or_path {pretrained_model_name_or_path}
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --center_crop
-                --random_flip
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 5
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=2
-                --use_ema
-                --seed=0
-                """.split()
-
-            run_command(self._launch_args + initial_run_args)
-
-            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
-            pipe(prompt, num_inference_steps=2)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-2", "checkpoint-4"},
-            )
-
-            # check can run an intermediate checkpoint
-            unet = UNet2DConditionModel.from_pretrained(tmpdir, subfolder="checkpoint-2/unet")
-            pipe = DiffusionPipeline.from_pretrained(pretrained_model_name_or_path, unet=unet, safety_checker=None)
-            pipe(prompt, num_inference_steps=2)
-
-            # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
-            shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
-
-            # Run training script for 7 total steps resuming from checkpoint 4
-
-            resume_run_args = f"""
-                examples/text_to_image/train_text_to_image.py
-                --pretrained_model_name_or_path {pretrained_model_name_or_path}
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --center_crop
-                --random_flip
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 7
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=2
-                --resume_from_checkpoint=checkpoint-4
-                --use_ema
-                --seed=0
-                """.split()
-
-            run_command(self._launch_args + resume_run_args)
-
-            # check can run new fully trained pipeline
-            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
-            pipe(prompt, num_inference_steps=2)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {
-                    # no checkpoint-2 -> check old checkpoints do not exist
-                    # check new checkpoints exist
-                    "checkpoint-4",
-                    "checkpoint-6",
-                },
-            )
-
-    def test_text_to_image_checkpointing_checkpoints_total_limit(self):
-        pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
-        prompt = "a prompt"
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Run training script with checkpointing
-            # max_train_steps == 7, checkpointing_steps == 2, checkpoints_total_limit == 2
-            # Should create checkpoints at steps 2, 4, 6
-            # with checkpoint at step 2 deleted
-
-            initial_run_args = f"""
-                examples/text_to_image/train_text_to_image.py
-                --pretrained_model_name_or_path {pretrained_model_name_or_path}
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --center_crop
-                --random_flip
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 7
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=2
-                --checkpoints_total_limit=2
-                --seed=0
-                """.split()
-
-            run_command(self._launch_args + initial_run_args)
-
-            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
-            pipe(prompt, num_inference_steps=2)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                # checkpoint-2 should have been deleted
-                {"checkpoint-4", "checkpoint-6"},
-            )
-
-    def test_text_to_image_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
-        pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
-        prompt = "a prompt"
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Run training script with checkpointing
-            # max_train_steps == 9, checkpointing_steps == 2
-            # Should create checkpoints at steps 2, 4, 6, 8
-
-            initial_run_args = f"""
-                examples/text_to_image/train_text_to_image.py
-                --pretrained_model_name_or_path {pretrained_model_name_or_path}
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --center_crop
-                --random_flip
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 9
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=2
-                --seed=0
-                """.split()
-
-            run_command(self._launch_args + initial_run_args)
-
-            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
-            pipe(prompt, num_inference_steps=2)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
-            )
-
-            # resume and we should try to checkpoint at 10, where we'll have to remove
-            # checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint
-
-            resume_run_args = f"""
-                examples/text_to_image/train_text_to_image.py
-                --pretrained_model_name_or_path {pretrained_model_name_or_path}
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --center_crop
-                --random_flip
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 11
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=2
-                --resume_from_checkpoint=checkpoint-8
-                --checkpoints_total_limit=3
-                --seed=0
-                """.split()
-
-            run_command(self._launch_args + resume_run_args)
-
-            pipe = DiffusionPipeline.from_pretrained(tmpdir, safety_checker=None)
-            pipe(prompt, num_inference_steps=2)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
-            )
-
-
-class TextToImageSDXL(ExamplesTestsAccelerate):
-    def test_text_to_image_sdxl(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/text_to_image/train_text_to_image_sdxl.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --center_crop
-                --random_flip
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors")))
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
@@ -1,308 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import sys
-import tempfile
-
-import safetensors
-
-from diffusers import DiffusionPipeline  # noqa: E402
-
-
-sys.path.append("..")
-from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-logger = logging.getLogger()
-stream_handler = logging.StreamHandler(sys.stdout)
-logger.addHandler(stream_handler)
-
-
-class TextToImageLoRA(ExamplesTestsAccelerate):
-    def test_text_to_image_lora_sdxl_checkpointing_checkpoints_total_limit(self):
-        prompt = "a prompt"
-        pipeline_path = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Run training script with checkpointing
-            # max_train_steps == 7, checkpointing_steps == 2, checkpoints_total_limit == 2
-            # Should create checkpoints at steps 2, 4, 6
-            # with checkpoint at step 2 deleted
-
-            initial_run_args = f"""
-                examples/text_to_image/train_text_to_image_lora_sdxl.py
-                --pretrained_model_name_or_path {pipeline_path}
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 7
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=2
-                --checkpoints_total_limit=2
-                """.split()
-
-            run_command(self._launch_args + initial_run_args)
-
-            pipe = DiffusionPipeline.from_pretrained(pipeline_path)
-            pipe.load_lora_weights(tmpdir)
-            pipe(prompt, num_inference_steps=2)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                # checkpoint-2 should have been deleted
-                {"checkpoint-4", "checkpoint-6"},
-            )
-
-    def test_text_to_image_lora_checkpointing_checkpoints_total_limit(self):
-        pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
-        prompt = "a prompt"
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Run training script with checkpointing
-            # max_train_steps == 7, checkpointing_steps == 2, checkpoints_total_limit == 2
-            # Should create checkpoints at steps 2, 4, 6
-            # with checkpoint at step 2 deleted
-
-            initial_run_args = f"""
-                examples/text_to_image/train_text_to_image_lora.py
-                --pretrained_model_name_or_path {pretrained_model_name_or_path}
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --center_crop
-                --random_flip
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 7
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=2
-                --checkpoints_total_limit=2
-                --seed=0
-                --num_validation_images=0
-                """.split()
-
-            run_command(self._launch_args + initial_run_args)
-
-            pipe = DiffusionPipeline.from_pretrained(
-                "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None
-            )
-            pipe.load_lora_weights(tmpdir)
-            pipe(prompt, num_inference_steps=2)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                # checkpoint-2 should have been deleted
-                {"checkpoint-4", "checkpoint-6"},
-            )
-
-    def test_text_to_image_lora_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
-        pretrained_model_name_or_path = "hf-internal-testing/tiny-stable-diffusion-pipe"
-        prompt = "a prompt"
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Run training script with checkpointing
-            # max_train_steps == 9, checkpointing_steps == 2
-            # Should create checkpoints at steps 2, 4, 6, 8
-
-            initial_run_args = f"""
-                examples/text_to_image/train_text_to_image_lora.py
-                --pretrained_model_name_or_path {pretrained_model_name_or_path}
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --center_crop
-                --random_flip
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 9
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=2
-                --seed=0
-                --num_validation_images=0
-                """.split()
-
-            run_command(self._launch_args + initial_run_args)
-
-            pipe = DiffusionPipeline.from_pretrained(
-                "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None
-            )
-            pipe.load_lora_weights(tmpdir)
-            pipe(prompt, num_inference_steps=2)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-2", "checkpoint-4", "checkpoint-6", "checkpoint-8"},
-            )
-
-            # resume and we should try to checkpoint at 10, where we'll have to remove
-            # checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint
-
-            resume_run_args = f"""
-                examples/text_to_image/train_text_to_image_lora.py
-                --pretrained_model_name_or_path {pretrained_model_name_or_path}
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --center_crop
-                --random_flip
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 11
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=2
-                --resume_from_checkpoint=checkpoint-8
-                --checkpoints_total_limit=3
-                --seed=0
-                --num_validation_images=0
-                """.split()
-
-            run_command(self._launch_args + resume_run_args)
-
-            pipe = DiffusionPipeline.from_pretrained(
-                "hf-internal-testing/tiny-stable-diffusion-pipe", safety_checker=None
-            )
-            pipe.load_lora_weights(tmpdir)
-            pipe(prompt, num_inference_steps=2)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-6", "checkpoint-8", "checkpoint-10"},
-            )
-
-
-class TextToImageLoRASDXL(ExamplesTestsAccelerate):
-    def test_text_to_image_lora_sdxl(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/text_to_image/train_text_to_image_lora_sdxl.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-    def test_text_to_image_lora_sdxl_with_text_encoder(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/text_to_image/train_text_to_image_lora_sdxl.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-xl-pipe
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --train_text_encoder
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # when not training the text encoder, all the parameters in the state dict should start
-            # with `"unet"` or `"text_encoder"` or `"text_encoder_2"` in their names.
-            keys = lora_state_dict.keys()
-            starts_with_unet = all(
-                k.startswith("unet") or k.startswith("text_encoder") or k.startswith("text_encoder_2") for k in keys
-            )
-            self.assertTrue(starts_with_unet)
-
-    def test_text_to_image_lora_sdxl_text_encoder_checkpointing_checkpoints_total_limit(self):
-        prompt = "a prompt"
-        pipeline_path = "hf-internal-testing/tiny-stable-diffusion-xl-pipe"
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            # Run training script with checkpointing
-            # max_train_steps == 7, checkpointing_steps == 2, checkpoints_total_limit == 2
-            # Should create checkpoints at steps 2, 4, 6
-            # with checkpoint at step 2 deleted
-
-            initial_run_args = f"""
-                examples/text_to_image/train_text_to_image_lora_sdxl.py
-                --pretrained_model_name_or_path {pipeline_path}
-                --dataset_name hf-internal-testing/dummy_image_text_data
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 7
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --train_text_encoder
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=2
-                --checkpoints_total_limit=2
-                """.split()
-
-            run_command(self._launch_args + initial_run_args)
-
-            pipe = DiffusionPipeline.from_pretrained(pipeline_path)
-            pipe.load_lora_weights(tmpdir)
-            pipe(prompt, num_inference_steps=2)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                # checkpoint-2 should have been deleted
-                {"checkpoint-4", "checkpoint-6"},
-            )
@@ -53,7 +53,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -33,7 +33,7 @@ from diffusers.utils import check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = logging.getLogger(__name__)

@@ -48,7 +48,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -58,7 +58,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__)

@@ -57,7 +57,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__)

@@ -1,160 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import sys
-import tempfile
-
-
-sys.path.append("..")
-from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-logger = logging.getLogger()
-stream_handler = logging.StreamHandler(sys.stdout)
-logger.addHandler(stream_handler)
-
-
-class TextualInversion(ExamplesTestsAccelerate):
-    def test_textual_inversion(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/textual_inversion/textual_inversion.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
-                --train_data_dir docs/source/en/imgs
-                --learnable_property object
-                --placeholder_token <cat-toy>
-                --initializer_token a
-                --validation_prompt <cat-toy>
-                --validation_steps 1
-                --save_steps 1
-                --num_vectors 2
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "learned_embeds.safetensors")))
-
-    def test_textual_inversion_checkpointing(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/textual_inversion/textual_inversion.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
-                --train_data_dir docs/source/en/imgs
-                --learnable_property object
-                --placeholder_token <cat-toy>
-                --initializer_token a
-                --validation_prompt <cat-toy>
-                --validation_steps 1
-                --save_steps 1
-                --num_vectors 2
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 3
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=1
-                --checkpoints_total_limit=2
-                """.split()
-
-            run_command(self._launch_args + test_args)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-2", "checkpoint-3"},
-            )
-
-    def test_textual_inversion_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/textual_inversion/textual_inversion.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
-                --train_data_dir docs/source/en/imgs
-                --learnable_property object
-                --placeholder_token <cat-toy>
-                --initializer_token a
-                --validation_prompt <cat-toy>
-                --validation_steps 1
-                --save_steps 1
-                --num_vectors 2
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 3
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=1
-                """.split()
-
-            run_command(self._launch_args + test_args)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-1", "checkpoint-2", "checkpoint-3"},
-            )
-
-            resume_run_args = f"""
-                examples/textual_inversion/textual_inversion.py
-                --pretrained_model_name_or_path hf-internal-testing/tiny-stable-diffusion-pipe
-                --train_data_dir docs/source/en/imgs
-                --learnable_property object
-                --placeholder_token <cat-toy>
-                --initializer_token a
-                --validation_prompt <cat-toy>
-                --validation_steps 1
-                --save_steps 1
-                --num_vectors 2
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 4
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --checkpointing_steps=1
-                --resume_from_checkpoint=checkpoint-3
-                --checkpoints_total_limit=2
-                """.split()
-
-            run_command(self._launch_args + resume_run_args)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-3", "checkpoint-4"},
-            )
@@ -79,7 +79,7 @@ else:


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__)

@@ -56,7 +56,7 @@ else:
 # ------------------------------------------------------------------------------

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = logging.getLogger(__name__)

@@ -1,130 +0,0 @@
-# coding=utf-8
-# Copyright 2023 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import sys
-import tempfile
-
-
-sys.path.append("..")
-from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-logger = logging.getLogger()
-stream_handler = logging.StreamHandler(sys.stdout)
-logger.addHandler(stream_handler)
-
-
-class Unconditional(ExamplesTestsAccelerate):
-    def test_train_unconditional(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                examples/unconditional_image_generation/train_unconditional.py
-                --dataset_name hf-internal-testing/dummy_image_class_data
-                --model_config_name_or_path diffusers/ddpm_dummy
-                --resolution 64
-                --output_dir {tmpdir}
-                --train_batch_size 2
-                --num_epochs 1
-                --gradient_accumulation_steps 1
-                --ddpm_num_inference_steps 2
-                --learning_rate 1e-3
-                --lr_warmup_steps 5
-                """.split()
-
-            run_command(self._launch_args + test_args, return_stdout=True)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "unet", "diffusion_pytorch_model.safetensors")))
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "scheduler", "scheduler_config.json")))
-
-    def test_unconditional_checkpointing_checkpoints_total_limit(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            initial_run_args = f"""
-                examples/unconditional_image_generation/train_unconditional.py
-                --dataset_name hf-internal-testing/dummy_image_class_data
-                --model_config_name_or_path diffusers/ddpm_dummy
-                --resolution 64
-                --output_dir {tmpdir}
-                --train_batch_size 1
-                --num_epochs 1
-                --gradient_accumulation_steps 1
-                --ddpm_num_inference_steps 2
-                --learning_rate 1e-3
-                --lr_warmup_steps 5
-                --checkpointing_steps=2
-                --checkpoints_total_limit=2
-                """.split()
-
-            run_command(self._launch_args + initial_run_args)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                # checkpoint-2 should have been deleted
-                {"checkpoint-4", "checkpoint-6"},
-            )
-
-    def test_unconditional_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            initial_run_args = f"""
-                examples/unconditional_image_generation/train_unconditional.py
-                --dataset_name hf-internal-testing/dummy_image_class_data
-                --model_config_name_or_path diffusers/ddpm_dummy
-                --resolution 64
-                --output_dir {tmpdir}
-                --train_batch_size 1
-                --num_epochs 1
-                --gradient_accumulation_steps 1
-                --ddpm_num_inference_steps 2
-                --learning_rate 1e-3
-                --lr_warmup_steps 5
-                --checkpointing_steps=1
-                """.split()
-
-            run_command(self._launch_args + initial_run_args)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-1", "checkpoint-2", "checkpoint-3", "checkpoint-4", "checkpoint-5", "checkpoint-6"},
-            )
-
-            resume_run_args = f"""
-                examples/unconditional_image_generation/train_unconditional.py
-                --dataset_name hf-internal-testing/dummy_image_class_data
-                --model_config_name_or_path diffusers/ddpm_dummy
-                --resolution 64
-                --output_dir {tmpdir}
-                --train_batch_size 1
-                --num_epochs 2
-                --gradient_accumulation_steps 1
-                --ddpm_num_inference_steps 2
-                --learning_rate 1e-3
-                --lr_warmup_steps 5
-                --resume_from_checkpoint=checkpoint-6
-                --checkpointing_steps=2
-                --checkpoints_total_limit=3
-                """.split()
-
-            run_command(self._launch_args + resume_run_args)
-
-            # check checkpoint directories exist
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-8", "checkpoint-10", "checkpoint-12"},
-            )
@@ -29,7 +29,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -5,4 +5,3 @@ wandb
 huggingface-cli
 bitsandbytes
 deepspeed
-peft>=0.6.0
@@ -31,14 +31,14 @@ from accelerate.utils import ProjectConfiguration, set_seed
 from datasets import load_dataset
 from huggingface_hub import create_repo, hf_hub_download, upload_folder
 from modeling_efficient_net_encoder import EfficientNetEncoder
-from peft import LoraConfig
-from peft.utils import get_peft_model_state_dict
 from torchvision import transforms
 from tqdm import tqdm
 from transformers import CLIPTextModel, PreTrainedTokenizerFast
 from transformers.utils import ContextManagers

 from diffusers import AutoPipelineForText2Image, DDPMWuerstchenScheduler, WuerstchenPriorPipeline
+from diffusers.loaders import AttnProcsLayers
+from diffusers.models.attention_processor import LoRAAttnProcessor
 from diffusers.optimization import get_scheduler
 from diffusers.pipelines.wuerstchen import DEFAULT_STAGE_C_TIMESTEPS, WuerstchenPrior
 from diffusers.utils import check_min_version, is_wandb_available, make_image_grid
@@ -50,7 +50,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -139,17 +139,17 @@ More information on all the CLI arguments and the environment are available on y
        f.write(yaml + model_card)


-def log_validation(text_encoder, tokenizer, prior, args, accelerator, weight_dtype, epoch):
+def log_validation(text_encoder, tokenizer, attn_processors, args, accelerator, weight_dtype, epoch):
    logger.info("Running validation... ")

    pipeline = AutoPipelineForText2Image.from_pretrained(
        args.pretrained_decoder_model_name_or_path,
-        prior=accelerator.unwrap_model(prior),
        prior_text_encoder=accelerator.unwrap_model(text_encoder),
        prior_tokenizer=tokenizer,
        torch_dtype=weight_dtype,
    )
    pipeline = pipeline.to(accelerator.device)
+    pipeline.prior_prior.set_attn_processor(attn_processors)
    pipeline.set_progress_bar_config(disable=True)

    if args.seed is None:
@@ -159,7 +159,7 @@ def log_validation(text_encoder, tokenizer, prior, args, accelerator, weight_dty

    images = []
    for i in range(len(args.validation_prompts)):
-        with torch.cuda.amp.autocast():
+        with torch.autocast("cuda"):
            image = pipeline(
                args.validation_prompts[i],
                prior_timesteps=DEFAULT_STAGE_C_TIMESTEPS,
@@ -167,6 +167,7 @@ def log_validation(text_encoder, tokenizer, prior, args, accelerator, weight_dty
                height=args.resolution,
                width=args.resolution,
            ).images[0]
+
        images.append(image)

    for tracker in accelerator.trackers:
@@ -526,50 +527,11 @@ def main():
    prior.to(accelerator.device, dtype=weight_dtype)

    # lora attn processor
-    prior_lora_config = LoraConfig(
-        r=args.rank, target_modules=["to_k", "to_q", "to_v", "to_out.0", "add_k_proj", "add_v_proj"]
-    )
-    prior.add_adapter(prior_lora_config)
-
-    # create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
-    def save_model_hook(models, weights, output_dir):
-        if accelerator.is_main_process:
-            prior_lora_layers_to_save = None
-
-            for model in models:
-                if isinstance(model, type(accelerator.unwrap_model(prior))):
-                    prior_lora_layers_to_save = get_peft_model_state_dict(model)
-                else:
-                    raise ValueError(f"unexpected save model: {model.__class__}")
-
-                # make sure to pop weight so that corresponding model is not saved again
-                weights.pop()
-
-            WuerstchenPriorPipeline.save_lora_weights(
-                output_dir,
-                unet_lora_layers=prior_lora_layers_to_save,
-            )
-
-    def load_model_hook(models, input_dir):
-        prior_ = None
-
-        while len(models) > 0:
-            model = models.pop()
-
-            if isinstance(model, type(accelerator.unwrap_model(prior))):
-                prior_ = model
-            else:
-                raise ValueError(f"unexpected save model: {model.__class__}")
-
-        lora_state_dict, network_alphas = WuerstchenPriorPipeline.lora_state_dict(input_dir)
-        WuerstchenPriorPipeline.load_lora_into_unet(lora_state_dict, network_alphas=network_alphas, unet=prior_)
-        WuerstchenPriorPipeline.load_lora_into_text_encoder(
-            lora_state_dict,
-            network_alphas=network_alphas,
-        )
-
-    accelerator.register_save_state_pre_hook(save_model_hook)
-    accelerator.register_load_state_pre_hook(load_model_hook)
+    lora_attn_procs = {}
+    for name in prior.attn_processors.keys():
+        lora_attn_procs[name] = LoRAAttnProcessor(hidden_size=prior.config["c"], rank=args.rank)
+    prior.set_attn_processor(lora_attn_procs)
+    lora_layers = AttnProcsLayers(prior.attn_processors)

    if args.allow_tf32:
        torch.backends.cuda.matmul.allow_tf32 = True
@@ -585,9 +547,8 @@ def main():
        optimizer_cls = bnb.optim.AdamW8bit
    else:
        optimizer_cls = torch.optim.AdamW
-    params_to_optimize = list(filter(lambda p: p.requires_grad, prior.parameters()))
    optimizer = optimizer_cls(
-        params_to_optimize,
+        lora_layers.parameters(),
        lr=args.learning_rate,
        betas=(args.adam_beta1, args.adam_beta2),
        weight_decay=args.adam_weight_decay,
@@ -713,8 +674,8 @@ def main():
        num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
    )

-    prior, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
-        prior, optimizer, train_dataloader, lr_scheduler
+    lora_layers, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
+        lora_layers, optimizer, train_dataloader, lr_scheduler
    )

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
@@ -821,7 +782,7 @@ def main():
                # Backpropagate
                accelerator.backward(loss)
                if accelerator.sync_gradients:
-                    accelerator.clip_grad_norm_(params_to_optimize, args.max_grad_norm)
+                    accelerator.clip_grad_norm_(lora_layers.parameters(), args.max_grad_norm)
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
@@ -867,19 +828,17 @@ def main():

        if accelerator.is_main_process:
            if args.validation_prompts is not None and epoch % args.validation_epochs == 0:
-                log_validation(text_encoder, tokenizer, prior, args, accelerator, weight_dtype, global_step)
+                log_validation(
+                    text_encoder, tokenizer, prior.attn_processors, args, accelerator, weight_dtype, global_step
+                )

    # Create the pipeline using the trained modules and save it.
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
-        prior = accelerator.unwrap_model(prior)
        prior = prior.to(torch.float32)
-
-        prior_lora_state_dict = get_peft_model_state_dict(prior)
-
        WuerstchenPriorPipeline.save_lora_weights(
-            save_directory=args.output_dir,
-            unet_lora_layers=prior_lora_state_dict,
+            os.path.join(args.output_dir, "prior_lora"),
+            unet_lora_layers=lora_layers,
        )

        # Run a final round of inference.
@@ -890,12 +849,11 @@ def main():
                args.pretrained_decoder_model_name_or_path,
                prior_text_encoder=accelerator.unwrap_model(text_encoder),
                prior_tokenizer=tokenizer,
-                torch_dtype=weight_dtype,
            )
-            pipeline = pipeline.to(accelerator.device)
-
+            pipeline = pipeline.to(accelerator.device, torch_dtype=weight_dtype)
            # load lora weights
-            pipeline.prior_pipe.load_lora_weights(args.output_dir, weight_name="pytorch_lora_weights.safetensors")
+            pipeline.prior_pipe.load_lora_weights(os.path.join(args.output_dir, "prior_lora"))
+
            pipeline.set_progress_bar_config(disable=True)

            if args.seed is None:
@@ -904,7 +862,7 @@ def main():
                generator = torch.Generator(device=accelerator.device).manual_seed(args.seed)

            for i in range(len(args.validation_prompts)):
-                with torch.cuda.amp.autocast():
+                with torch.autocast("cuda"):
                    image = pipeline(
                        args.validation_prompts[i],
                        prior_timesteps=DEFAULT_STAGE_C_TIMESTEPS,
@@ -51,7 +51,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.25.0.dev0")
+check_min_version("0.24.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -118,10 +118,9 @@ _deps = [
    "pytest-timeout",
    "pytest-xdist",
    "python>=3.8.0",
-    "ruff==0.1.5",
+    "ruff>=0.1.5,<=0.2",
    "safetensors>=0.3.1",
    "sentencepiece>=0.1.91,!=0.1.92",
-    "GitPython<3.1.19",
    "scipy",
    "onnx",
    "regex!=2019.12.17",
@@ -207,7 +206,6 @@ extras["docs"] = deps_list("hf-doc-builder")
 extras["training"] = deps_list("accelerate", "datasets", "protobuf", "tensorboard", "Jinja2")
 extras["test"] = deps_list(
    "compel",
-    "GitPython",
    "datasets",
    "Jinja2",
    "invisible-watermark",
@@ -251,13 +249,13 @@ version_range_max = max(sys.version_info[1], 10) + 1

 setup(
    name="diffusers",
-    version="0.25.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.24.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    description="State-of-the-art diffusion in PyTorch and JAX.",
    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
    keywords="deep learning diffusion jax pytorch stable diffusion audioldm",
-    license="Apache 2.0 License",
-    author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/diffusers/graphs/contributors)",
+    license="Apache",
+    author="The HuggingFace team",
    author_email="patrick@huggingface.co",
    url="https://github.com/huggingface/diffusers",
    package_dir={"": "src"},
@@ -281,3 +279,24 @@ setup(
    + [f"Programming Language :: Python :: 3.{i}" for i in range(8, version_range_max)],
    cmdclass={"deps_table_update": DepsTableUpdateCommand},
 )
+
+
+# Release checklist
+# 1. Change the version in __init__.py and setup.py.
+# 2. Commit these changes with the message: "Release: Release"
+# 3. Add a tag in git to mark the release: "git tag RELEASE -m 'Adds tag RELEASE for PyPI'"
+#    Push the tag to git: git push --tags origin main
+# 4. Run the following commands in the top-level directory:
+#      python setup.py bdist_wheel
+#      python setup.py sdist
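+# 5. Upload the package to the PyPI test server first:
+#      twine upload dist/* -r pypitest
+#    You may have to specify the repository URL explicitly; if so, use the following command instead:
+#      twine upload dist/* -r pypitest --repository-url=https://test.pypi.org/legacy/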
+# 6. Check that you can install it in a virtualenv by running:
+#      pip install -i https://testpypi.python.org/pypi diffusers
+#      diffusers env
+#      diffusers test
+# 7. Upload the final version to the actual PyPI:
+#      twine upload dist/* -r pypi
+# 8. Add release notes to the tag in GitHub once everything is looking hunky-dory.
+# 9. Update the version in __init__.py, setup.py to the new version "-dev" and push to main.
@@ -1,4 +1,4 @@
-__version__ = "0.25.0.dev0"
+__version__ = "0.24.0.dev0"

 from typing import TYPE_CHECKING

@@ -269,6 +269,7 @@ else:
            "StableDiffusionPix2PixZeroPipeline",
            "StableDiffusionSAGPipeline",
            "StableDiffusionUpscalePipeline",
+            "StableDiffusionVideoPipeline",
            "StableDiffusionXLAdapterPipeline",
            "StableDiffusionXLControlNetImg2ImgPipeline",
            "StableDiffusionXLControlNetInpaintPipeline",
@@ -279,10 +280,8 @@ else:
            "StableDiffusionXLPipeline",
            "StableUnCLIPImg2ImgPipeline",
            "StableUnCLIPPipeline",
-            "StableVideoDiffusionPipeline",
            "TextToVideoSDPipeline",
            "TextToVideoZeroPipeline",
-            "TextToVideoZeroSDXLPipeline",
            "UnCLIPImageVariationPipeline",
            "UnCLIPPipeline",
            "UniDiffuserModel",
@@ -622,6 +621,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            StableDiffusionPix2PixZeroPipeline,
            StableDiffusionSAGPipeline,
            StableDiffusionUpscalePipeline,
+            StableDiffusionVideoPipeline,
            StableDiffusionXLAdapterPipeline,
            StableDiffusionXLControlNetImg2ImgPipeline,
            StableDiffusionXLControlNetInpaintPipeline,
@@ -632,10 +632,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            StableDiffusionXLPipeline,
            StableUnCLIPImg2ImgPipeline,
            StableUnCLIPPipeline,
-            StableVideoDiffusionPipeline,
            TextToVideoSDPipeline,
            TextToVideoZeroPipeline,
-            TextToVideoZeroSDXLPipeline,
            UnCLIPImageVariationPipeline,
            UnCLIPPipeline,
            UniDiffuserModel,
@@ -30,10 +30,9 @@ deps = {
    "pytest-timeout": "pytest-timeout",
    "pytest-xdist": "pytest-xdist",
    "python": "python>=3.8.0",
-    "ruff": "ruff==0.1.5",
+    "ruff": "ruff>=0.1.5,<=0.2",
    "safetensors": "safetensors>=0.3.1",
    "sentencepiece": "sentencepiece>=0.1.91,!=0.1.92",
-    "GitPython": "GitPython<3.1.19",
    "scipy": "scipy",
    "onnx": "onnx",
    "regex": "regex!=2019.12.17",
@@ -113,7 +113,7 @@ class ValueGuidedRLPipeline(DiffusionPipeline):
            prev_x = self.unet(x.permute(0, 2, 1), timesteps).sample.permute(0, 2, 1)

            # TODO: verify deprecation of this kwarg
-            x = self.scheduler.step(prev_x, i, x)["prev_sample"]
+            x = self.scheduler.step(prev_x, i, x, predict_epsilon=False)["prev_sample"]

            # apply conditions to the trajectory (set the initial state)
            x = self.reset_x0(x, conditions, self.action_dim)
@@ -33,15 +33,6 @@ PipelineImageInput = Union[
    List[torch.FloatTensor],
 ]

-PipelineDepthInput = Union[
-    PIL.Image.Image,
-    np.ndarray,
-    torch.FloatTensor,
-    List[PIL.Image.Image],
-    List[np.ndarray],
-    List[torch.FloatTensor],
-]
-

 class VaeImageProcessor(ConfigMixin):
    """
@@ -450,18 +441,6 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):

        return pil_images

-    @staticmethod
-    def depth_pil_to_numpy(images: Union[List[PIL.Image.Image], PIL.Image.Image]) -> np.ndarray:
-        """
-        Convert a PIL image or a list of PIL images to NumPy arrays.
-        """
-        if not isinstance(images, list):
-            images = [images]
-
-        images = [np.array(image).astype(np.float32) / (2**16 - 1) for image in images]
-        images = np.stack(images, axis=0)
-        return images
-
    @staticmethod
    def rgblike_to_depthmap(image: Union[np.ndarray, torch.Tensor]) -> Union[np.ndarray, torch.Tensor]:
        """
@@ -547,102 +526,3 @@ class VaeImageProcessorLDM3D(VaeImageProcessor):
            return self.numpy_to_pil(image), self.numpy_to_depth(image)
        else:
            raise Exception(f"This type {output_type} is not supported")
-
-    def preprocess(
-        self,
-        rgb: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
-        depth: Union[torch.FloatTensor, PIL.Image.Image, np.ndarray],
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        target_res: Optional[int] = None,
-    ) -> torch.Tensor:
-        """
-        Preprocess the image input. Accepted formats are PIL images, NumPy arrays or PyTorch tensors.
-        """
-        supported_formats = (PIL.Image.Image, np.ndarray, torch.Tensor)
-
-        # Expand the missing dimension for 3-dimensional pytorch tensor or numpy array that represents grayscale image
-        if self.config.do_convert_grayscale and isinstance(rgb, (torch.Tensor, np.ndarray)) and rgb.ndim == 3:
-            raise Exception("This is not yet supported")
-
-        if isinstance(rgb, supported_formats):
-            rgb = [rgb]
-            depth = [depth]
-        elif not (isinstance(rgb, list) and all(isinstance(i, supported_formats) for i in rgb)):
-            raise ValueError(
-                f"Input is in incorrect format: {[type(i) for i in rgb]}. Currently, we only support {', '.join(supported_formats)}"
-            )
-
-        if isinstance(rgb[0], PIL.Image.Image):
-            if self.config.do_convert_rgb:
-                raise Exception("This is not yet supported")
-                # rgb = [self.convert_to_rgb(i) for i in rgb]
-                # depth = [self.convert_to_depth(i) for i in depth]  #TODO define convert_to_depth
-            if self.config.do_resize or target_res:
-                height, width = self.get_default_height_width(rgb[0], height, width) if not target_res else target_res
-                rgb = [self.resize(i, height, width) for i in rgb]
-                depth = [self.resize(i, height, width) for i in depth]
-            rgb = self.pil_to_numpy(rgb)  # to np
-            rgb = self.numpy_to_pt(rgb)  # to pt
-
-            depth = self.depth_pil_to_numpy(depth)  # to np
-            depth = self.numpy_to_pt(depth)  # to pt
-
-        elif isinstance(rgb[0], np.ndarray):
-            rgb = np.concatenate(rgb, axis=0) if rgb[0].ndim == 4 else np.stack(rgb, axis=0)
-            rgb = self.numpy_to_pt(rgb)
-            height, width = self.get_default_height_width(rgb, height, width)
-            if self.config.do_resize:
-                rgb = self.resize(rgb, height, width)
-
-            depth = np.concatenate(depth, axis=0) if rgb[0].ndim == 4 else np.stack(depth, axis=0)
-            depth = self.numpy_to_pt(depth)
-            height, width = self.get_default_height_width(depth, height, width)
-            if self.config.do_resize:
-                depth = self.resize(depth, height, width)
-
-        elif isinstance(rgb[0], torch.Tensor):
-            raise Exception("This is not yet supported")
-            # rgb = torch.cat(rgb, axis=0) if rgb[0].ndim == 4 else torch.stack(rgb, axis=0)
-
-            # if self.config.do_convert_grayscale and rgb.ndim == 3:
-            #     rgb = rgb.unsqueeze(1)
-
-            # channel = rgb.shape[1]
-
-            # height, width = self.get_default_height_width(rgb, height, width)
-            # if self.config.do_resize:
-            #     rgb = self.resize(rgb, height, width)
-
-            # depth = torch.cat(depth, axis=0) if depth[0].ndim == 4 else torch.stack(depth, axis=0)
-
-            # if self.config.do_convert_grayscale and depth.ndim == 3:
-            #     depth = depth.unsqueeze(1)
-
-            # channel = depth.shape[1]
-            # # don't need any preprocess if the image is latents
-            # if depth == 4:
-            #     return rgb, depth
-
-            # height, width = self.get_default_height_width(depth, height, width)
-            # if self.config.do_resize:
-            #     depth = self.resize(depth, height, width)
-        # expected range [0,1], normalize to [-1,1]
-        do_normalize = self.config.do_normalize
-        if rgb.min() < 0 and do_normalize:
-            warnings.warn(
-                "Passing `image` as torch tensor with value range in [-1,1] is deprecated. The expected value range for image tensor is [0,1] "
-                f"when passing as pytorch tensor or numpy Array. You passed `image` with value range [{rgb.min()},{rgb.max()}]",
-                FutureWarning,
-            )
-            do_normalize = False
-
-        if do_normalize:
-            rgb = self.normalize(rgb)
-            depth = self.normalize(depth)
-
-        if self.config.do_binarize:
-            rgb = self.binarize(rgb)
-            depth = self.binarize(depth)
-
-        return rgb, depth
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+import re
 from contextlib import nullcontext
 from typing import Callable, Dict, List, Optional, Union

@@ -43,13 +44,13 @@ from ..utils import (
    set_adapter_layers,
    set_weights_and_activate_adapters,
 )
-from .lora_conversion_utils import _convert_kohya_lora_to_diffusers, _maybe_map_sgm_blocks_to_diffusers


 if is_transformers_available():
-    from transformers import PreTrainedModel
+    from transformers import CLIPTextModel, CLIPTextModelWithProjection

-    from ..models.lora import PatchedLoraProjection, text_encoder_attn_modules, text_encoder_mlp_modules
+    # To be deprecated soon
+    from ..models.lora import PatchedLoraProjection

 if is_accelerate_available():
    from accelerate import init_empty_weights
@@ -66,10 +67,37 @@ LORA_WEIGHT_NAME_SAFE = "pytorch_lora_weights.safetensors"
 LORA_DEPRECATION_MESSAGE = "You are using an old version of LoRA backend. This will be deprecated in the next releases in favor of PEFT make sure to install the latest PEFT and transformers packages in the future."


+def text_encoder_attn_modules(text_encoder):
+    """
+    Collect the self-attention modules of a CLIP text encoder.
+
+    Returns a list of `(name, module)` tuples, one per entry of `text_encoder.text_model.encoder.layers`, where
+    `name` is `text_model.encoder.layers.{i}.self_attn` and `module` is that layer's `self_attn` attribute.
+
+    Raises `ValueError` when `text_encoder` is neither a `CLIPTextModel` nor a `CLIPTextModelWithProjection`.
+    """
+    if not isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)):
+        raise ValueError(f"do not know how to get attention modules for: {text_encoder.__class__.__name__}")
+
+    encoder_layers = text_encoder.text_model.encoder.layers
+    return [
+        (f"text_model.encoder.layers.{idx}.self_attn", block.self_attn) for idx, block in enumerate(encoder_layers)
+    ]
+
+
+def text_encoder_mlp_modules(text_encoder):
+    """
+    Collect the MLP modules of a CLIP text encoder.
+
+    Returns a list of `(name, module)` tuples, one per entry of `text_encoder.text_model.encoder.layers`, where
+    `name` is `text_model.encoder.layers.{i}.mlp` and `module` is that layer's `mlp` attribute.
+
+    Raises `ValueError` when `text_encoder` is neither a `CLIPTextModel` nor a `CLIPTextModelWithProjection`.
+    """
+    if not isinstance(text_encoder, (CLIPTextModel, CLIPTextModelWithProjection)):
+        raise ValueError(f"do not know how to get mlp modules for: {text_encoder.__class__.__name__}")
+
+    encoder_layers = text_encoder.text_model.encoder.layers
+    return [(f"text_model.encoder.layers.{idx}.mlp", block.mlp) for idx, block in enumerate(encoder_layers)]
+
+
 class LoraLoaderMixin:
    r"""
-    Load LoRA layers into [`UNet2DConditionModel`] and
-    [`CLIPTextModel`](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel).
+    Load LoRA layers into [`UNet2DConditionModel`] and [`~transformers.CLIPTextModel`].
    """

    text_encoder_name = TEXT_ENCODER_NAME
@@ -95,12 +123,28 @@ class LoraLoaderMixin:

        Parameters:
            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
-                See [`~loaders.LoraLoaderMixin.lora_state_dict`].
+                A string (model id of a pretrained model hosted on the Hub), a path to a directory containing the model
+                weights, or a [torch state
+                dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
            kwargs (`dict`, *optional*):
                See [`~loaders.LoraLoaderMixin.lora_state_dict`].
            adapter_name (`str`, *optional*):
-                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
-                `default_{i}` where i is the total number of adapters being loaded.
+                Name for referencing the loaded adapter model. If not specified, it will use `default_{i}` where `i` is
+                the total number of adapters being loaded. Must have PEFT installed to use.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16).to(
+            "cuda"
+        )
+        pipeline.load_lora_weights(
+            "Yntec/pineappleAnimeMix", weight_name="pineappleAnimeMix_pineapple10.1.safetensors", adapter_name="anime"
+        )
+        ```
        """
        # First, ensure that the checkpoint is a compatible one and can be successfully loaded.
        state_dict, network_alphas = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
@@ -138,15 +182,7 @@ class LoraLoaderMixin:
        **kwargs,
    ):
        r"""
-        Return state dict for lora weights and the network alphas.
-
-        <Tip warning={true}>
-
-        We support loading A1111 formatted LoRA checkpoints in a limited capacity.
-
-        This function is experimental and might change in the future.
-
-        </Tip>
+        Return state dict and network alphas of the LoRA weights.

        Parameters:
            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
@@ -154,8 +190,7 @@ class LoraLoaderMixin:

                    - A string, the *model id* (for example `google/ddpm-celebahq-256`) of a pretrained model hosted on
                      the Hub.
-                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights saved
-                      with [`ModelMixin.save_pretrained`].
+                    - A path to a *directory* (for example `./my_model_directory`) containing the model weights.
                    - A [torch state
                      dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).

@@ -191,7 +226,6 @@ class LoraLoaderMixin:
                Mirror source to resolve accessibility issues if you're downloading a model in China. We do not
                guarantee the timeliness or safety of the source, and you should refer to the mirror site for more
                information.
-
        """
        # Load the main state dict first which has the LoRA layers for either of
        # UNet and text encoder or both.
@@ -288,8 +322,8 @@ class LoraLoaderMixin:
            # Map SDXL blocks correctly.
            if unet_config is not None:
                # use unet config to remap block numbers
-                state_dict = _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config)
-            state_dict, network_alphas = _convert_kohya_lora_to_diffusers(state_dict)
+                state_dict = cls._maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config)
+            state_dict, network_alphas = cls._convert_kohya_lora_to_diffusers(state_dict)

        return state_dict, network_alphas

@@ -329,6 +363,109 @@ class LoraLoaderMixin:
        weight_name = targeted_files[0]
        return weight_name

+    @classmethod
+    def _maybe_map_sgm_blocks_to_diffusers(cls, state_dict, unet_config, delimiter="_", block_slice_pos=5):
+        """
+        Rename state dict keys that use `input_blocks` / `middle_block` / `output_blocks` naming.
+
+        If no key contains one of those three patterns, `state_dict` is returned unchanged. Otherwise a new dict
+        is built in which the block index in each key is replaced by a `(block id, inner block name, layer index)`
+        triple, with inner block names taken from `resnets`, `attentions`, `upsamplers` and `downsamplers`.
+
+        Note that in the remapping path entries are popped from `state_dict`, so the dict passed in is emptied.
+
+        Parameters:
+            state_dict (`dict`):
+                Mapping from key names to values. Keys are split on `delimiter` to locate the block index.
+            unet_config:
+                Object exposing `layers_per_block`, used to turn a flat block index into a block id and a layer
+                index within that block.
+            delimiter (`str`, defaults to `"_"`):
+                Separator used to split and re-join key names.
+            block_slice_pos (`int`, defaults to `5`):
+                Position, in the split key, of the inner block index; the block index is read from the element
+                just before it.
+
+        Returns:
+            `dict`: The original `state_dict` if no remapping is needed, otherwise the remapped dict.
+
+        Raises:
+            `ValueError`: If a non-text key matches none of the three patterns, if a middle block index is not 0,
+            1 or 2, or if some entries are left unconverted at the end.
+        """
+        # 1. get all state_dict_keys
+        all_keys = list(state_dict.keys())
+        sgm_patterns = ["input_blocks", "middle_block", "output_blocks"]
+
+        # 2. check if needs remapping, if not return original dict
+        is_in_sgm_format = False
+        for key in all_keys:
+            if any(p in key for p in sgm_patterns):
+                is_in_sgm_format = True
+                break
+
+        if not is_in_sgm_format:
+            return state_dict
+
+        # 3. Else remap from SGM patterns
+        new_state_dict = {}
+        # Indexed by the inner block index parsed from the key: 0 -> resnets, 1 -> attentions, 2 -> upsamplers.
+        inner_block_map = ["resnets", "attentions", "upsamplers"]
+
+        # Retrieves # of down, mid and up blocks
+        input_block_ids, middle_block_ids, output_block_ids = set(), set(), set()
+
+        for layer in all_keys:
+            if "text" in layer:
+                # Keys containing "text" are moved over without renaming.
+                new_state_dict[layer] = state_dict.pop(layer)
+            else:
+                # The block index is the last of the first `block_slice_pos` delimiter-separated parts.
+                layer_id = int(layer.split(delimiter)[:block_slice_pos][-1])
+                if sgm_patterns[0] in layer:
+                    input_block_ids.add(layer_id)
+                elif sgm_patterns[1] in layer:
+                    middle_block_ids.add(layer_id)
+                elif sgm_patterns[2] in layer:
+                    output_block_ids.add(layer_id)
+                else:
+                    raise ValueError(f"Checkpoint not supported because layer {layer} not supported.")
+
+        # Group the remaining keys by block index.
+        # NOTE(review): these are substring matches without a trailing delimiter, so index 1 would also match
+        # indices 10-19. Presumably supported checkpoints only have single-digit block indices - confirm.
+        input_blocks = {
+            layer_id: [key for key in state_dict if f"input_blocks{delimiter}{layer_id}" in key]
+            for layer_id in input_block_ids
+        }
+        middle_blocks = {
+            layer_id: [key for key in state_dict if f"middle_block{delimiter}{layer_id}" in key]
+            for layer_id in middle_block_ids
+        }
+        output_blocks = {
+            layer_id: [key for key in state_dict if f"output_blocks{delimiter}{layer_id}" in key]
+            for layer_id in output_block_ids
+        }
+
+        # Rename keys accordingly
+        for i in input_block_ids:
+            # Input block indices are shifted by one before being split into (block id, layer within block).
+            block_id = (i - 1) // (unet_config.layers_per_block + 1)
+            layer_in_block_id = (i - 1) % (unet_config.layers_per_block + 1)
+
+            for key in input_blocks[i]:
+                inner_block_id = int(key.split(delimiter)[block_slice_pos])
+                # Keys containing "op" are mapped to `downsamplers` with layer index 0.
+                inner_block_key = inner_block_map[inner_block_id] if "op" not in key else "downsamplers"
+                inner_layers_in_block = str(layer_in_block_id) if "op" not in key else "0"
+                # Replace the two index parts with (block id, inner block name, layer index).
+                new_key = delimiter.join(
+                    key.split(delimiter)[: block_slice_pos - 1]
+                    + [str(block_id), inner_block_key, inner_layers_in_block]
+                    + key.split(delimiter)[block_slice_pos + 1 :]
+                )
+                new_state_dict[new_key] = state_dict.pop(key)
+
+        for i in middle_block_ids:
+            # Middle block indices map to: 0 -> resnets 0, 1 -> attentions 0, 2 -> resnets 1.
+            key_part = None
+            if i == 0:
+                key_part = [inner_block_map[0], "0"]
+            elif i == 1:
+                key_part = [inner_block_map[1], "0"]
+            elif i == 2:
+                key_part = [inner_block_map[0], "1"]
+            else:
+                raise ValueError(f"Invalid middle block id {i}.")
+
+            for key in middle_blocks[i]:
+                # Only the single index part is replaced here; everything from `block_slice_pos` on is kept.
+                new_key = delimiter.join(
+                    key.split(delimiter)[: block_slice_pos - 1] + key_part + key.split(delimiter)[block_slice_pos:]
+                )
+                new_state_dict[new_key] = state_dict.pop(key)
+
+        for i in output_block_ids:
+            # Unlike input blocks, output block indices are used without the shift by one.
+            block_id = i // (unet_config.layers_per_block + 1)
+            layer_in_block_id = i % (unet_config.layers_per_block + 1)
+
+            for key in output_blocks[i]:
+                inner_block_id = int(key.split(delimiter)[block_slice_pos])
+                inner_block_key = inner_block_map[inner_block_id]
+                # Inner index 2 (`upsamplers`) always gets layer index 0.
+                inner_layers_in_block = str(layer_in_block_id) if inner_block_id < 2 else "0"
+                new_key = delimiter.join(
+                    key.split(delimiter)[: block_slice_pos - 1]
+                    + [str(block_id), inner_block_key, inner_layers_in_block]
+                    + key.split(delimiter)[block_slice_pos + 1 :]
+                )
+                new_state_dict[new_key] = state_dict.pop(key)
+
+        # Every entry must have been popped into `new_state_dict` by now.
+        if len(state_dict) > 0:
+            raise ValueError("At this point all state dict entries have to be converted.")
+
+        return new_state_dict
+
    @classmethod
    def _optionally_disable_offloading(cls, _pipeline):
        """
@@ -365,25 +502,27 @@ class LoraLoaderMixin:
        cls, state_dict, network_alphas, unet, low_cpu_mem_usage=None, adapter_name=None, _pipeline=None
    ):
        """
-        This will load the LoRA layers specified in `state_dict` into `unet`.
+        Load LoRA layers specified in `state_dict` into `unet`.

        Parameters:
            state_dict (`dict`):
-                A standard state dict containing the lora layer parameters. The keys can either be indexed directly
-                into the unet or prefixed with an additional `unet` which can be used to distinguish between text
-                encoder lora layers.
+                A standard state dict containing the LoRA layer parameters. The keys can either be indexed directly
+                into the `unet` or prefixed with an additional `unet`, which can be used to distinguish between text
+                encoder LoRA layers.
            network_alphas (`Dict[str, float]`):
-                See `LoRALinearLayer` for more details.
+                See
+                [`LoRALinearLayer`](https://github.com/huggingface/diffusers/blob/c697f524761abd2314c030221a3ad2f7791eab4e/src/diffusers/models/lora.py#L182)
+                for more details.
            unet (`UNet2DConditionModel`):
                The UNet model to load the LoRA layers into.
            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
-                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
-                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
-                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
-                argument to `True` will raise an error.
+                Only load and not initialize the pretrained weights. This can speed up model loading and also tries to
+                not use more than 1x model size in CPU memory (including peak memory) while loading the model. Only
+                supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this argument to
+                `True` will raise an error.
            adapter_name (`str`, *optional*):
-                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
-                `default_{i}` where i is the total number of adapters being loaded.
+                Name for referencing the loaded adapter model. If not specified, it will use `default_{i}` where `i` is
+                the total number of adapters being loaded.
        """
        low_cpu_mem_usage = low_cpu_mem_usage if low_cpu_mem_usage is not None else _LOW_CPU_MEM_USAGE_DEFAULT
        # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
@@ -391,10 +530,6 @@ class LoraLoaderMixin:
        # their prefixes.
        keys = list(state_dict.keys())

-        if all(key.startswith("unet.unet") for key in keys):
-            deprecation_message = "Keys starting with 'unet.unet' are deprecated."
-            deprecate("unet.unet keys", "0.27", deprecation_message)
-
        if all(key.startswith(cls.unet_name) or key.startswith(cls.text_encoder_name) for key in keys):
            # Load the layers corresponding to UNet.
            logger.info(f"Loading {cls.unet_name}.")
@@ -411,9 +546,8 @@ class LoraLoaderMixin:
        else:
            # Otherwise, we're dealing with the old format. This means the `state_dict` should only
            # contain the module names of the `unet` as its keys WITHOUT any prefix.
-            if not USE_PEFT_BACKEND:
-                warn_message = "You have saved the LoRA weights using the old format. To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet.{module_name}': params for module_name, params in old_state_dict.items()}`."
-                logger.warn(warn_message)
+            warn_message = "You have saved the LoRA weights using the old format. To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet.{module_name}': params for module_name, params in old_state_dict.items()}`."
+            logger.warn(warn_message)

        if USE_PEFT_BACKEND and len(state_dict.keys()) > 0:
            from peft import LoraConfig, inject_adapter_in_model, set_peft_model_state_dict
@@ -482,26 +616,27 @@ class LoraLoaderMixin:
        _pipeline=None,
    ):
        """
-        This will load the LoRA layers specified in `state_dict` into `text_encoder`
+        Load LoRA layers specified in `state_dict` into `text_encoder`.

        Parameters:
            state_dict (`dict`):
-                A standard state dict containing the lora layer parameters. The key should be prefixed with an
-                additional `text_encoder` to distinguish between unet lora layers.
+                A standard state dict containing the LoRA layer parameters. The key should be prefixed with an
+                additional `text_encoder` to distinguish between UNet LoRA layers.
            network_alphas (`Dict[str, float]`):
-                See `LoRALinearLayer` for more details.
+                See
+                [`LoRALinearLayer`](https://github.com/huggingface/diffusers/blob/c697f524761abd2314c030221a3ad2f7791eab4e/src/diffusers/models/lora.py#L182)
+                for more details.
            text_encoder (`CLIPTextModel`):
                The text encoder model to load the LoRA layers into.
            prefix (`str`):
                Expected prefix of the `text_encoder` in the `state_dict`.
            lora_scale (`float`):
-                How much to scale the output of the lora linear layer before it is added with the output of the regular
-                lora layer.
+                Scale of `LoRALinearLayer`'s output before it is added with the output of the regular LoRA layer.
            low_cpu_mem_usage (`bool`, *optional*, defaults to `True` if torch version >= 1.9.0 else `False`):
-                Speed up model loading only loading the pretrained weights and not initializing the weights. This also
-                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
-                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
-                argument to `True` will raise an error.
+                Only load and not initialize the pretrained weights. This can speed up model loading and also tries
+                to not use more than 1x model size in CPU memory (including peak memory) while loading the model. Only
+                supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this argument to
+                `True` will raise an error.
            adapter_name (`str`, *optional*):
                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
                `default_{i}` where i is the total number of adapters being loaded.
@@ -680,7 +815,8 @@ class LoraLoaderMixin:

    @classmethod
    def _remove_text_encoder_monkey_patch_classmethod(cls, text_encoder):
-        deprecate("_remove_text_encoder_monkey_patch_classmethod", "0.27", LORA_DEPRECATION_MESSAGE)
+        if version.parse(__version__) > version.parse("0.23"):
+            deprecate("_remove_text_encoder_monkey_patch_classmethod", "0.25", LORA_DEPRECATION_MESSAGE)

        for _, attn_module in text_encoder_attn_modules(text_encoder):
            if isinstance(attn_module.q_proj, PatchedLoraProjection):
@@ -708,7 +844,8 @@ class LoraLoaderMixin:
        r"""
        Monkey-patches the forward passes of attention modules of the text encoder.
        """
-        deprecate("_modify_text_encoder", "0.27", LORA_DEPRECATION_MESSAGE)
+        if version.parse(__version__) > version.parse("0.23"):
+            deprecate("_modify_text_encoder", "0.25", LORA_DEPRECATION_MESSAGE)

        def create_patched_linear_lora(model, network_alpha, rank, dtype, lora_parameters):
            linear_layer = model.regular_linear_layer if isinstance(model, PatchedLoraProjection) else model
@@ -784,11 +921,11 @@ class LoraLoaderMixin:
        safe_serialization: bool = True,
    ):
        r"""
-        Save the LoRA parameters corresponding to the UNet and text encoder.
+        Save the UNet and text encoder LoRA parameters.

        Arguments:
            save_directory (`str` or `os.PathLike`):
-                Directory to save LoRA parameters to. Will be created if it doesn't exist.
+                Directory to save LoRA parameters to (will be created if it doesn't exist).
            unet_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
                State dict of the LoRA layers corresponding to the `unet`.
            text_encoder_lora_layers (`Dict[str, torch.nn.Module]` or `Dict[str, torch.Tensor]`):
@@ -799,27 +936,54 @@ class LoraLoaderMixin:
                need to call this function on all processes. In this case, set `is_main_process=True` only on the main
                process to avoid race conditions.
            save_function (`Callable`):
-                The function to use to save the state dictionary. Useful during distributed training when you need to
-                replace `torch.save` with another method. Can be configured with the environment variable
+                The function to use to save the state dict. Useful during distributed training when you need to replace
+                `torch.save` with another method. Can be configured with the environment variable
                `DIFFUSERS_SAVE_MODE`.
            safe_serialization (`bool`, *optional*, defaults to `True`):
-                Whether to save the model using `safetensors` or the traditional PyTorch way with `pickle`.
+                Whether to save the model using `safetensors` or with `pickle`.
+
+        Example:
+
+        ```py
+        from diffusers import StableDiffusionXLPipeline
+        from peft.utils import get_peft_model_state_dict
+        import torch
+
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.fuse_lora()
+
+        # get and save unet state dict
+        unet_state_dict = get_peft_model_state_dict(pipeline.unet, adapter_name="pixel")
+        pipeline.save_lora_weights("fused-model", unet_lora_layers=unet_state_dict)
+        pipeline.load_lora_weights("fused-model", weight_name="pytorch_lora_weights.safetensors")
+        ```
        """
+        # Create a flat dictionary.
        state_dict = {}

-        def pack_weights(layers, prefix):
-            layers_weights = layers.state_dict() if isinstance(layers, torch.nn.Module) else layers
-            layers_state_dict = {f"{prefix}.{module_name}": param for module_name, param in layers_weights.items()}
-            return layers_state_dict
+        # Populate the dictionary.
+        if unet_lora_layers is not None:
+            weights = (
+                unet_lora_layers.state_dict() if isinstance(unet_lora_layers, torch.nn.Module) else unet_lora_layers
+            )

-        if not (unet_lora_layers or text_encoder_lora_layers):
-            raise ValueError("You must pass at least one of `unet_lora_layers`, `text_encoder_lora_layers`.")
+            unet_lora_state_dict = {f"{cls.unet_name}.{module_name}": param for module_name, param in weights.items()}
+            state_dict.update(unet_lora_state_dict)

-        if unet_lora_layers:
-            state_dict.update(pack_weights(unet_lora_layers, "unet"))
+        if text_encoder_lora_layers is not None:
+            weights = (
+                text_encoder_lora_layers.state_dict()
+                if isinstance(text_encoder_lora_layers, torch.nn.Module)
+                else text_encoder_lora_layers
+            )

-        if text_encoder_lora_layers:
-            state_dict.update(pack_weights(text_encoder_lora_layers, "text_encoder"))
+            text_encoder_lora_state_dict = {
+                f"{cls.text_encoder_name}.{module_name}": param for module_name, param in weights.items()
+            }
+            state_dict.update(text_encoder_lora_state_dict)

        # Save the model
        cls.write_lora_layers(
@@ -864,16 +1028,186 @@ class LoraLoaderMixin:
        save_function(state_dict, os.path.join(save_directory, weight_name))
        logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}")

+    @classmethod
+    def _convert_kohya_lora_to_diffusers(cls, state_dict):
+        unet_state_dict = {}
+        te_state_dict = {}
+        te2_state_dict = {}
+        network_alphas = {}
+
+        # every down weight has a corresponding up weight and potentially an alpha weight
+        lora_keys = [k for k in state_dict.keys() if k.endswith("lora_down.weight")]
+        for key in lora_keys:
+            lora_name = key.split(".")[0]
+            lora_name_up = lora_name + ".lora_up.weight"
+            lora_name_alpha = lora_name + ".alpha"
+
+            if lora_name.startswith("lora_unet_"):
+                diffusers_name = key.replace("lora_unet_", "").replace("_", ".")
+
+                if "input.blocks" in diffusers_name:
+                    diffusers_name = diffusers_name.replace("input.blocks", "down_blocks")
+                else:
+                    diffusers_name = diffusers_name.replace("down.blocks", "down_blocks")
+
+                if "middle.block" in diffusers_name:
+                    diffusers_name = diffusers_name.replace("middle.block", "mid_block")
+                else:
+                    diffusers_name = diffusers_name.replace("mid.block", "mid_block")
+                if "output.blocks" in diffusers_name:
+                    diffusers_name = diffusers_name.replace("output.blocks", "up_blocks")
+                else:
+                    diffusers_name = diffusers_name.replace("up.blocks", "up_blocks")
+
+                diffusers_name = diffusers_name.replace("transformer.blocks", "transformer_blocks")
+                diffusers_name = diffusers_name.replace("to.q.lora", "to_q_lora")
+                diffusers_name = diffusers_name.replace("to.k.lora", "to_k_lora")
+                diffusers_name = diffusers_name.replace("to.v.lora", "to_v_lora")
+                diffusers_name = diffusers_name.replace("to.out.0.lora", "to_out_lora")
+                diffusers_name = diffusers_name.replace("proj.in", "proj_in")
+                diffusers_name = diffusers_name.replace("proj.out", "proj_out")
+                diffusers_name = diffusers_name.replace("emb.layers", "time_emb_proj")
+
+                # SDXL specificity.
+                if "emb" in diffusers_name and "time.emb.proj" not in diffusers_name:
+                    pattern = r"\.\d+(?=\D*$)"
+                    diffusers_name = re.sub(pattern, "", diffusers_name, count=1)
+                if ".in." in diffusers_name:
+                    diffusers_name = diffusers_name.replace("in.layers.2", "conv1")
+                if ".out." in diffusers_name:
+                    diffusers_name = diffusers_name.replace("out.layers.3", "conv2")
+                if "downsamplers" in diffusers_name or "upsamplers" in diffusers_name:
+                    diffusers_name = diffusers_name.replace("op", "conv")
+                if "skip" in diffusers_name:
+                    diffusers_name = diffusers_name.replace("skip.connection", "conv_shortcut")
+
+                # LyCORIS specificity.
+                if "time.emb.proj" in diffusers_name:
+                    diffusers_name = diffusers_name.replace("time.emb.proj", "time_emb_proj")
+                if "conv.shortcut" in diffusers_name:
+                    diffusers_name = diffusers_name.replace("conv.shortcut", "conv_shortcut")
+
+                # General coverage.
+                if "transformer_blocks" in diffusers_name:
+                    if "attn1" in diffusers_name or "attn2" in diffusers_name:
+                        diffusers_name = diffusers_name.replace("attn1", "attn1.processor")
+                        diffusers_name = diffusers_name.replace("attn2", "attn2.processor")
+                        unet_state_dict[diffusers_name] = state_dict.pop(key)
+                        unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+                    elif "ff" in diffusers_name:
+                        unet_state_dict[diffusers_name] = state_dict.pop(key)
+                        unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+                elif any(key in diffusers_name for key in ("proj_in", "proj_out")):
+                    unet_state_dict[diffusers_name] = state_dict.pop(key)
+                    unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+                else:
+                    unet_state_dict[diffusers_name] = state_dict.pop(key)
+                    unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+
+            elif lora_name.startswith("lora_te_"):
+                diffusers_name = key.replace("lora_te_", "").replace("_", ".")
+                diffusers_name = diffusers_name.replace("text.model", "text_model")
+                diffusers_name = diffusers_name.replace("self.attn", "self_attn")
+                diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
+                diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
+                diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
+                diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
+                if "self_attn" in diffusers_name:
+                    te_state_dict[diffusers_name] = state_dict.pop(key)
+                    te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+                elif "mlp" in diffusers_name:
+                    # Be aware that this is the new diffusers convention and the rest of the code might
+                    # not utilize it yet.
+                    diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
+                    te_state_dict[diffusers_name] = state_dict.pop(key)
+                    te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+
+            # (sayakpaul): Duplicate code. Needs to be cleaned.
+            elif lora_name.startswith("lora_te1_"):
+                diffusers_name = key.replace("lora_te1_", "").replace("_", ".")
+                diffusers_name = diffusers_name.replace("text.model", "text_model")
+                diffusers_name = diffusers_name.replace("self.attn", "self_attn")
+                diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
+                diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
+                diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
+                diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
+                if "self_attn" in diffusers_name:
+                    te_state_dict[diffusers_name] = state_dict.pop(key)
+                    te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+                elif "mlp" in diffusers_name:
+                    # Be aware that this is the new diffusers convention and the rest of the code might
+                    # not utilize it yet.
+                    diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
+                    te_state_dict[diffusers_name] = state_dict.pop(key)
+                    te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+
+            # (sayakpaul): Duplicate code. Needs to be cleaned.
+            elif lora_name.startswith("lora_te2_"):
+                diffusers_name = key.replace("lora_te2_", "").replace("_", ".")
+                diffusers_name = diffusers_name.replace("text.model", "text_model")
+                diffusers_name = diffusers_name.replace("self.attn", "self_attn")
+                diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
+                diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
+                diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
+                diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
+                if "self_attn" in diffusers_name:
+                    te2_state_dict[diffusers_name] = state_dict.pop(key)
+                    te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+                elif "mlp" in diffusers_name:
+                    # Be aware that this is the new diffusers convention and the rest of the code might
+                    # not utilize it yet.
+                    diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
+                    te2_state_dict[diffusers_name] = state_dict.pop(key)
+                    te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+
+            # Rename the alphas so that they can be mapped appropriately.
+            if lora_name_alpha in state_dict:
+                alpha = state_dict.pop(lora_name_alpha).item()
+                if lora_name_alpha.startswith("lora_unet_"):
+                    prefix = "unet."
+                elif lora_name_alpha.startswith(("lora_te_", "lora_te1_")):
+                    prefix = "text_encoder."
+                else:
+                    prefix = "text_encoder_2."
+                new_name = prefix + diffusers_name.split(".lora.")[0] + ".alpha"
+                network_alphas.update({new_name: alpha})
+
+        if len(state_dict) > 0:
+            raise ValueError(
+                f"The following keys have not been correctly be renamed: \n\n {', '.join(state_dict.keys())}"
+            )
+
+        logger.info("Kohya-style checkpoint detected.")
+        unet_state_dict = {f"{cls.unet_name}.{module_name}": params for module_name, params in unet_state_dict.items()}
+        te_state_dict = {
+            f"{cls.text_encoder_name}.{module_name}": params for module_name, params in te_state_dict.items()
+        }
+        te2_state_dict = (
+            {f"text_encoder_2.{module_name}": params for module_name, params in te2_state_dict.items()}
+            if len(te2_state_dict) > 0
+            else None
+        )
+        if te2_state_dict is not None:
+            te_state_dict.update(te2_state_dict)
+
+        new_state_dict = {**unet_state_dict, **te_state_dict}
+        return new_state_dict, network_alphas
+
    def unload_lora_weights(self):
        """
-        Unloads the LoRA parameters.
+        Unload the LoRA parameters from a pipeline.

        Examples:

-        ```python
-        >>> # Assuming `pipeline` is already loaded with the LoRA parameters.
-        >>> pipeline.unload_lora_weights()
-        >>> ...
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.unload_lora_weights()
        ```
        """
        if not USE_PEFT_BACKEND:
@@ -902,7 +1236,7 @@ class LoraLoaderMixin:
        safe_fusing: bool = False,
    ):
        r"""
-        Fuses the LoRA parameters into the original parameters of the corresponding blocks.
+        Fuse the LoRA parameters with the original parameters in their corresponding blocks.

        <Tip warning={true}>

@@ -916,9 +1250,23 @@ class LoraLoaderMixin:
                Whether to fuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the
                LoRA parameters then it won't have any effect.
            lora_scale (`float`, defaults to 1.0):
-                Controls how much to influence the outputs with the LoRA parameters.
+                Controls LoRA influence on the outputs.
            safe_fusing (`bool`, defaults to `False`):
-                Whether to check fused weights for NaN values before fusing and if values are NaN not fusing them.
+                Whether to check the fused weights for `NaN` values before fusing, and skip fusing if any `NaN` values
+                are found.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.fuse_lora(lora_scale=0.7)
+        ```
        """
        if fuse_unet or fuse_text_encoder:
            self.num_fused_loras += 1
@@ -943,7 +1291,8 @@ class LoraLoaderMixin:
                        module.merge()

        else:
-            deprecate("fuse_text_encoder_lora", "0.27", LORA_DEPRECATION_MESSAGE)
+            if version.parse(__version__) > version.parse("0.23"):
+                deprecate("fuse_text_encoder_lora", "0.25", LORA_DEPRECATION_MESSAGE)

            def fuse_text_encoder_lora(text_encoder, lora_scale=1.0, safe_fusing=False):
                for _, attn_module in text_encoder_attn_modules(text_encoder):
@@ -966,8 +1315,7 @@ class LoraLoaderMixin:

    def unfuse_lora(self, unfuse_unet: bool = True, unfuse_text_encoder: bool = True):
        r"""
-        Reverses the effect of
-        [`pipe.fuse_lora()`](https://huggingface.co/docs/diffusers/main/en/api/loaders#diffusers.loaders.LoraLoaderMixin.fuse_lora).
+        Unfuse the LoRA parameters from the original parameters in their corresponding blocks.

        <Tip warning={true}>

@@ -980,6 +1328,20 @@ class LoraLoaderMixin:
            unfuse_text_encoder (`bool`, defaults to `True`):
                Whether to unfuse the text encoder LoRA parameters. If the text encoder wasn't monkey-patched with the
                LoRA parameters then it won't have any effect.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.fuse_lora(lora_scale=0.7)
+        pipeline.unfuse_lora()
+        ```
        """
        if unfuse_unet:
            if not USE_PEFT_BACKEND:
@@ -1000,7 +1362,8 @@ class LoraLoaderMixin:
                        module.unmerge()

        else:
-            deprecate("unfuse_text_encoder_lora", "0.27", LORA_DEPRECATION_MESSAGE)
+            if version.parse(__version__) > version.parse("0.23"):
+                deprecate("unfuse_text_encoder_lora", "0.25", LORA_DEPRECATION_MESSAGE)

            def unfuse_text_encoder_lora(text_encoder):
                for _, attn_module in text_encoder_attn_modules(text_encoder):
@@ -1030,16 +1393,32 @@ class LoraLoaderMixin:
        text_encoder_weights: List[float] = None,
    ):
        """
-        Sets the adapter layers for the text encoder.
+        Set the currently active adapter for use in the text encoder.

        Args:
            adapter_names (`List[str]` or `str`):
-                The names of the adapters to use.
+                The adapter to activate.
            text_encoder (`torch.nn.Module`, *optional*):
-                The text encoder module to set the adapter layers for. If `None`, it will try to get the `text_encoder`
-                attribute.
+                The text encoder module to activate the adapter layers for. If `None`, it will try to get the
+                `text_encoder` attribute.
            text_encoder_weights (`List[float]`, *optional*):
                The weights to use for the text encoder. If `None`, the weights are set to `1.0` for all the adapters.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.load_lora_weights(
+            "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic"
+        )
+        pipeline.set_adapters_for_text_encoder("pixel")
+        ```
        """
        if not USE_PEFT_BACKEND:
            raise ValueError("PEFT backend is required for this method.")
@@ -1065,14 +1444,27 @@ class LoraLoaderMixin:
            )
        set_weights_and_activate_adapters(text_encoder, adapter_names, text_encoder_weights)

-    def disable_lora_for_text_encoder(self, text_encoder: Optional["PreTrainedModel"] = None):
+    def disable_lora_for_text_encoder(self, text_encoder: Optional["PreTrainedModel"] = None):  # noqa: F821
        """
-        Disables the LoRA layers for the text encoder.
+        Disable the text encoder's LoRA layers.

        Args:
            text_encoder (`torch.nn.Module`, *optional*):
                The text encoder module to disable the LoRA layers for. If `None`, it will try to get the
                `text_encoder` attribute.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.disable_lora_for_text_encoder()
+        ```
        """
        if not USE_PEFT_BACKEND:
            raise ValueError("PEFT backend is required for this method.")
@@ -1082,14 +1474,27 @@ class LoraLoaderMixin:
            raise ValueError("Text Encoder not found.")
        set_adapter_layers(text_encoder, enabled=False)

-    def enable_lora_for_text_encoder(self, text_encoder: Optional["PreTrainedModel"] = None):
+    def enable_lora_for_text_encoder(self, text_encoder: Optional["PreTrainedModel"] = None):  # noqa: F821
        """
-        Enables the LoRA layers for the text encoder.
+        Enable the text encoder's LoRA layers.

        Args:
            text_encoder (`torch.nn.Module`, *optional*):
                The text encoder module to enable the LoRA layers for. If `None`, it will try to get the `text_encoder`
                attribute.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.enable_lora_for_text_encoder()
+        ```
        """
        if not USE_PEFT_BACKEND:
            raise ValueError("PEFT backend is required for this method.")
@@ -1140,10 +1545,24 @@ class LoraLoaderMixin:

    def delete_adapters(self, adapter_names: Union[List[str], str]):
        """
+        Delete an adapter's LoRA layers from the UNet and text encoder(s).
+
        Args:
-        Deletes the LoRA layers of `adapter_name` for the unet and text-encoder(s).
            adapter_names (`Union[List[str], str]`):
-                The names of the adapter to delete. Can be a single string or a list of strings
+                The name(s) of the adapter(s) to delete, given as a single string or a list of strings.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.delete_adapters("pixel")
+        ```
        """
        if not USE_PEFT_BACKEND:
            raise ValueError("PEFT backend is required for this method.")
@@ -1163,7 +1582,7 @@ class LoraLoaderMixin:

    def get_active_adapters(self) -> List[str]:
        """
-        Gets the list of the current active adapters.
+        Get a list of currently active adapters.

        Example:

@@ -1195,7 +1614,22 @@ class LoraLoaderMixin:

    def get_list_adapters(self) -> Dict[str, List[str]]:
        """
-        Gets the current list of all available adapters in the pipeline.
+        Get a list of all currently available adapters for each component in the pipeline.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+        ).to("cuda")
+        pipeline.load_lora_weights(
+            "jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", adapter_name="cinematic"
+        )
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.get_list_adapters()
+        ```
        """
        if not USE_PEFT_BACKEND:
            raise ValueError(
@@ -1217,14 +1651,27 @@ class LoraLoaderMixin:

    def set_lora_device(self, adapter_names: List[str], device: Union[torch.device, str, int]) -> None:
        """
-        Moves the LoRAs listed in `adapter_names` to a target device. Useful for offloading the LoRA to the CPU in case
-        you want to load multiple adapters and free some GPU memory.
+        Move a LoRA to a target device. Useful for offloading a LoRA to the CPU in case you want to load multiple
+        adapters and free some GPU memory.

        Args:
            adapter_names (`List[str]`):
-                List of adapters to send device to.
+                List of adapters to send to device.
            device (`Union[torch.device, str, int]`):
-                Device to send the adapters to. Can be either a torch device, a str or an integer.
+                Device (can be a `torch.device`, `str` or `int`) to place adapters on.
+
+        Example:
+
+        ```py
+        from diffusers import DiffusionPipeline
+        import torch
+
+        pipeline = DiffusionPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0",
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        pipeline.set_lora_device(["pixel"], device="cuda")
+        ```
        """
        if not USE_PEFT_BACKEND:
            raise ValueError("PEFT backend is required for this method.")
@@ -1256,7 +1703,7 @@ class LoraLoaderMixin:


 class StableDiffusionXLLoraLoaderMixin(LoraLoaderMixin):
-    """This class overrides `LoraLoaderMixin` with LoRA loading/saving code that's specific to SDXL"""
+    """This class overrides [`LoraLoaderMixin`] with LoRA loading/saving code that's specific to SDXL."""

    # Overrride to properly handle the loading and unloading of the additional text encoder.
    def load_lora_weights(
@@ -1281,12 +1728,26 @@ class StableDiffusionXLLoraLoaderMixin(LoraLoaderMixin):

        Parameters:
            pretrained_model_name_or_path_or_dict (`str` or `os.PathLike` or `dict`):
-                See [`~loaders.LoraLoaderMixin.lora_state_dict`].
-            adapter_name (`str`, *optional*):
-                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
-                `default_{i}` where i is the total number of adapters being loaded.
+                A string (model id of a pretrained model hosted on the Hub), a path to a directory containing the model
+                weights, or a [torch state
+                dict](https://pytorch.org/tutorials/beginner/saving_loading_models.html#what-is-a-state-dict).
            kwargs (`dict`, *optional*):
                See [`~loaders.LoraLoaderMixin.lora_state_dict`].
+            adapter_name (`str`, *optional*):
+                Name for referencing the loaded adapter model. If not specified, it will use `default_{i}` where `i` is
+                the total number of adapters being loaded. Must have PEFT installed to use.
+
+        Example:
+
+        ```py
+        from diffusers import StableDiffusionXLPipeline
+        import torch
+
+        pipeline = StableDiffusionXLPipeline.from_pretrained(
+            "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
+        ).to("cuda")
+        pipeline.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+        ```
        """
        # We could have accessed the unet config from `lora_state_dict()` too. We pass
        # it here explicitly to be able to tell that it's coming from an SDXL
@@ -1,284 +0,0 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import re
-
-from ..utils import logging
-
-
-logger = logging.get_logger(__name__)
-
-
-def _maybe_map_sgm_blocks_to_diffusers(state_dict, unet_config, delimiter="_", block_slice_pos=5):
-    # 1. get all state_dict_keys
-    all_keys = list(state_dict.keys())
-    sgm_patterns = ["input_blocks", "middle_block", "output_blocks"]
-
-    # 2. check if needs remapping, if not return original dict
-    is_in_sgm_format = False
-    for key in all_keys:
-        if any(p in key for p in sgm_patterns):
-            is_in_sgm_format = True
-            break
-
-    if not is_in_sgm_format:
-        return state_dict
-
-    # 3. Else remap from SGM patterns
-    new_state_dict = {}
-    inner_block_map = ["resnets", "attentions", "upsamplers"]
-
-    # Retrieves # of down, mid and up blocks
-    input_block_ids, middle_block_ids, output_block_ids = set(), set(), set()
-
-    for layer in all_keys:
-        if "text" in layer:
-            new_state_dict[layer] = state_dict.pop(layer)
-        else:
-            layer_id = int(layer.split(delimiter)[:block_slice_pos][-1])
-            if sgm_patterns[0] in layer:
-                input_block_ids.add(layer_id)
-            elif sgm_patterns[1] in layer:
-                middle_block_ids.add(layer_id)
-            elif sgm_patterns[2] in layer:
-                output_block_ids.add(layer_id)
-            else:
-                raise ValueError(f"Checkpoint not supported because layer {layer} not supported.")
-
-    input_blocks = {
-        layer_id: [key for key in state_dict if f"input_blocks{delimiter}{layer_id}" in key]
-        for layer_id in input_block_ids
-    }
-    middle_blocks = {
-        layer_id: [key for key in state_dict if f"middle_block{delimiter}{layer_id}" in key]
-        for layer_id in middle_block_ids
-    }
-    output_blocks = {
-        layer_id: [key for key in state_dict if f"output_blocks{delimiter}{layer_id}" in key]
-        for layer_id in output_block_ids
-    }
-
-    # Rename keys accordingly
-    for i in input_block_ids:
-        block_id = (i - 1) // (unet_config.layers_per_block + 1)
-        layer_in_block_id = (i - 1) % (unet_config.layers_per_block + 1)
-
-        for key in input_blocks[i]:
-            inner_block_id = int(key.split(delimiter)[block_slice_pos])
-            inner_block_key = inner_block_map[inner_block_id] if "op" not in key else "downsamplers"
-            inner_layers_in_block = str(layer_in_block_id) if "op" not in key else "0"
-            new_key = delimiter.join(
-                key.split(delimiter)[: block_slice_pos - 1]
-                + [str(block_id), inner_block_key, inner_layers_in_block]
-                + key.split(delimiter)[block_slice_pos + 1 :]
-            )
-            new_state_dict[new_key] = state_dict.pop(key)
-
-    for i in middle_block_ids:
-        key_part = None
-        if i == 0:
-            key_part = [inner_block_map[0], "0"]
-        elif i == 1:
-            key_part = [inner_block_map[1], "0"]
-        elif i == 2:
-            key_part = [inner_block_map[0], "1"]
-        else:
-            raise ValueError(f"Invalid middle block id {i}.")
-
-        for key in middle_blocks[i]:
-            new_key = delimiter.join(
-                key.split(delimiter)[: block_slice_pos - 1] + key_part + key.split(delimiter)[block_slice_pos:]
-            )
-            new_state_dict[new_key] = state_dict.pop(key)
-
-    for i in output_block_ids:
-        block_id = i // (unet_config.layers_per_block + 1)
-        layer_in_block_id = i % (unet_config.layers_per_block + 1)
-
-        for key in output_blocks[i]:
-            inner_block_id = int(key.split(delimiter)[block_slice_pos])
-            inner_block_key = inner_block_map[inner_block_id]
-            inner_layers_in_block = str(layer_in_block_id) if inner_block_id < 2 else "0"
-            new_key = delimiter.join(
-                key.split(delimiter)[: block_slice_pos - 1]
-                + [str(block_id), inner_block_key, inner_layers_in_block]
-                + key.split(delimiter)[block_slice_pos + 1 :]
-            )
-            new_state_dict[new_key] = state_dict.pop(key)
-
-    if len(state_dict) > 0:
-        raise ValueError("At this point all state dict entries have to be converted.")
-
-    return new_state_dict
-
-
-def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_name="text_encoder"):
-    unet_state_dict = {}
-    te_state_dict = {}
-    te2_state_dict = {}
-    network_alphas = {}
-
-    # every down weight has a corresponding up weight and potentially an alpha weight
-    lora_keys = [k for k in state_dict.keys() if k.endswith("lora_down.weight")]
-    for key in lora_keys:
-        lora_name = key.split(".")[0]
-        lora_name_up = lora_name + ".lora_up.weight"
-        lora_name_alpha = lora_name + ".alpha"
-
-        if lora_name.startswith("lora_unet_"):
-            diffusers_name = key.replace("lora_unet_", "").replace("_", ".")
-
-            if "input.blocks" in diffusers_name:
-                diffusers_name = diffusers_name.replace("input.blocks", "down_blocks")
-            else:
-                diffusers_name = diffusers_name.replace("down.blocks", "down_blocks")
-
-            if "middle.block" in diffusers_name:
-                diffusers_name = diffusers_name.replace("middle.block", "mid_block")
-            else:
-                diffusers_name = diffusers_name.replace("mid.block", "mid_block")
-            if "output.blocks" in diffusers_name:
-                diffusers_name = diffusers_name.replace("output.blocks", "up_blocks")
-            else:
-                diffusers_name = diffusers_name.replace("up.blocks", "up_blocks")
-
-            diffusers_name = diffusers_name.replace("transformer.blocks", "transformer_blocks")
-            diffusers_name = diffusers_name.replace("to.q.lora", "to_q_lora")
-            diffusers_name = diffusers_name.replace("to.k.lora", "to_k_lora")
-            diffusers_name = diffusers_name.replace("to.v.lora", "to_v_lora")
-            diffusers_name = diffusers_name.replace("to.out.0.lora", "to_out_lora")
-            diffusers_name = diffusers_name.replace("proj.in", "proj_in")
-            diffusers_name = diffusers_name.replace("proj.out", "proj_out")
-            diffusers_name = diffusers_name.replace("emb.layers", "time_emb_proj")
-
-            # SDXL specificity.
-            if "emb" in diffusers_name and "time.emb.proj" not in diffusers_name:
-                pattern = r"\.\d+(?=\D*$)"
-                diffusers_name = re.sub(pattern, "", diffusers_name, count=1)
-            if ".in." in diffusers_name:
-                diffusers_name = diffusers_name.replace("in.layers.2", "conv1")
-            if ".out." in diffusers_name:
-                diffusers_name = diffusers_name.replace("out.layers.3", "conv2")
-            if "downsamplers" in diffusers_name or "upsamplers" in diffusers_name:
-                diffusers_name = diffusers_name.replace("op", "conv")
-            if "skip" in diffusers_name:
-                diffusers_name = diffusers_name.replace("skip.connection", "conv_shortcut")
-
-            # LyCORIS specificity.
-            if "time.emb.proj" in diffusers_name:
-                diffusers_name = diffusers_name.replace("time.emb.proj", "time_emb_proj")
-            if "conv.shortcut" in diffusers_name:
-                diffusers_name = diffusers_name.replace("conv.shortcut", "conv_shortcut")
-
-            # General coverage.
-            if "transformer_blocks" in diffusers_name:
-                if "attn1" in diffusers_name or "attn2" in diffusers_name:
-                    diffusers_name = diffusers_name.replace("attn1", "attn1.processor")
-                    diffusers_name = diffusers_name.replace("attn2", "attn2.processor")
-                    unet_state_dict[diffusers_name] = state_dict.pop(key)
-                    unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-                elif "ff" in diffusers_name:
-                    unet_state_dict[diffusers_name] = state_dict.pop(key)
-                    unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-            elif any(key in diffusers_name for key in ("proj_in", "proj_out")):
-                unet_state_dict[diffusers_name] = state_dict.pop(key)
-                unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-            else:
-                unet_state_dict[diffusers_name] = state_dict.pop(key)
-                unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-
-        elif lora_name.startswith("lora_te_"):
-            diffusers_name = key.replace("lora_te_", "").replace("_", ".")
-            diffusers_name = diffusers_name.replace("text.model", "text_model")
-            diffusers_name = diffusers_name.replace("self.attn", "self_attn")
-            diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
-            diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
-            diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
-            diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
-            if "self_attn" in diffusers_name:
-                te_state_dict[diffusers_name] = state_dict.pop(key)
-                te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-            elif "mlp" in diffusers_name:
-                # Be aware that this is the new diffusers convention and the rest of the code might
-                # not utilize it yet.
-                diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
-                te_state_dict[diffusers_name] = state_dict.pop(key)
-                te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-
-        # (sayakpaul): Duplicate code. Needs to be cleaned.
-        elif lora_name.startswith("lora_te1_"):
-            diffusers_name = key.replace("lora_te1_", "").replace("_", ".")
-            diffusers_name = diffusers_name.replace("text.model", "text_model")
-            diffusers_name = diffusers_name.replace("self.attn", "self_attn")
-            diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
-            diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
-            diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
-            diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
-            if "self_attn" in diffusers_name:
-                te_state_dict[diffusers_name] = state_dict.pop(key)
-                te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-            elif "mlp" in diffusers_name:
-                # Be aware that this is the new diffusers convention and the rest of the code might
-                # not utilize it yet.
-                diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
-                te_state_dict[diffusers_name] = state_dict.pop(key)
-                te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-
-        # (sayakpaul): Duplicate code. Needs to be cleaned.
-        elif lora_name.startswith("lora_te2_"):
-            diffusers_name = key.replace("lora_te2_", "").replace("_", ".")
-            diffusers_name = diffusers_name.replace("text.model", "text_model")
-            diffusers_name = diffusers_name.replace("self.attn", "self_attn")
-            diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
-            diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
-            diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
-            diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
-            if "self_attn" in diffusers_name:
-                te2_state_dict[diffusers_name] = state_dict.pop(key)
-                te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-            elif "mlp" in diffusers_name:
-                # Be aware that this is the new diffusers convention and the rest of the code might
-                # not utilize it yet.
-                diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
-                te2_state_dict[diffusers_name] = state_dict.pop(key)
-                te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-
-        # Rename the alphas so that they can be mapped appropriately.
-        if lora_name_alpha in state_dict:
-            alpha = state_dict.pop(lora_name_alpha).item()
-            if lora_name_alpha.startswith("lora_unet_"):
-                prefix = "unet."
-            elif lora_name_alpha.startswith(("lora_te_", "lora_te1_")):
-                prefix = "text_encoder."
-            else:
-                prefix = "text_encoder_2."
-            new_name = prefix + diffusers_name.split(".lora.")[0] + ".alpha"
-            network_alphas.update({new_name: alpha})
-
-    if len(state_dict) > 0:
-        raise ValueError(f"The following keys have not been correctly be renamed: \n\n {', '.join(state_dict.keys())}")
-
-    logger.info("Kohya-style checkpoint detected.")
-    unet_state_dict = {f"{unet_name}.{module_name}": params for module_name, params in unet_state_dict.items()}
-    te_state_dict = {f"{text_encoder_name}.{module_name}": params for module_name, params in te_state_dict.items()}
-    te2_state_dict = (
-        {f"text_encoder_2.{module_name}": params for module_name, params in te2_state_dict.items()}
-        if len(te2_state_dict) > 0
-        else None
-    )
-    if te2_state_dict is not None:
-        te_state_dict.update(te2_state_dict)
-
-    new_state_dict = {**unet_state_dict, **te_state_dict}
-    return new_state_dict, network_alphas
@@ -189,7 +189,7 @@ class TextualInversionLoaderMixin:
                f" `{self.load_textual_inversion.__name__}`"
            )

-        if len(pretrained_model_name_or_paths) > 1 and len(pretrained_model_name_or_paths) != len(tokens):
+        if len(pretrained_model_name_or_paths) != len(tokens):
            raise ValueError(
                f"You have passed a list of models of length {len(pretrained_model_name_or_paths)}, and list of tokens of length {len(tokens)} "
                f"Make sure both lists have the same length."
@@ -382,9 +382,7 @@ class TextualInversionLoaderMixin:
            if not isinstance(pretrained_model_name_or_path, list)
            else pretrained_model_name_or_path
        )
-        tokens = [token] if not isinstance(token, list) else token
-        if tokens[0] is None:
-            tokens = tokens * len(pretrained_model_name_or_paths)
+        tokens = len(pretrained_model_name_or_paths) * [token] if (isinstance(token, str) or token is None) else token

        # 3. Check inputs
        self._check_text_inv_inputs(tokenizer, text_encoder, pretrained_model_name_or_paths, tokens)
@@ -392,16 +390,6 @@ class TextualInversionLoaderMixin:
        # 4. Load state dicts of textual embeddings
        state_dicts = load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs)

-        # 4.1 Handle the special case when state_dict is a tensor that contains n embeddings for n tokens
-        if len(tokens) > 1 and len(state_dicts) == 1:
-            if isinstance(state_dicts[0], torch.Tensor):
-                state_dicts = list(state_dicts[0])
-                if len(tokens) != len(state_dicts):
-                    raise ValueError(
-                        f"You have passed a state_dict contains {len(state_dicts)} embeddings, and list of tokens of length {len(tokens)} "
-                        f"Make sure both have the same length."
-                    )
-
        # 4. Retrieve tokens and embeddings
        tokens, embeddings = self._retrieve_tokens_and_embeddings(tokens, state_dicts, tokenizer)

@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
-from collections import OrderedDict, defaultdict
+from collections import defaultdict
 from contextlib import nullcontext
 from typing import Callable, Dict, List, Optional, Union

@@ -21,7 +21,7 @@ import torch
 import torch.nn.functional as F
 from torch import nn

-from ..models.embeddings import ImageProjection, Resampler
+from ..models.embeddings import ImageProjection
 from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta
 from ..utils import (
    DIFFUSERS_CACHE,
@@ -672,17 +672,6 @@ class UNet2DConditionLoadersMixin:
            IPAdapterAttnProcessor2_0,
        )

-        if "proj.weight" in state_dict["image_proj"]:
-            # IP-Adapter
-            num_image_text_embeds = 4
-        else:
-            # IP-Adapter Plus
-            num_image_text_embeds = state_dict["image_proj"]["latents"].shape[1]
-
-        # Set encoder_hid_proj after loading ip_adapter weights,
-        # because `Resampler` also has `attn_processors`.
-        self.encoder_hid_proj = None
-
        # set ip-adapter cross-attention processors & load state_dict
        attn_procs = {}
        key_id = 1
@@ -706,10 +695,7 @@ class UNet2DConditionLoadersMixin:
                    IPAdapterAttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else IPAdapterAttnProcessor
                )
                attn_procs[name] = attn_processor_class(
-                    hidden_size=hidden_size,
-                    cross_attention_dim=cross_attention_dim,
-                    scale=1.0,
-                    num_tokens=num_image_text_embeds,
+                    hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0
                ).to(dtype=self.dtype, device=self.device)

                value_dict = {}
@@ -722,76 +708,26 @@ class UNet2DConditionLoadersMixin:
        self.set_attn_processor(attn_procs)

        # create image projection layers.
-        if "proj.weight" in state_dict["image_proj"]:
-            # IP-Adapter
-            clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1]
-            cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4
+        clip_embeddings_dim = state_dict["image_proj"]["proj.weight"].shape[-1]
+        cross_attention_dim = state_dict["image_proj"]["proj.weight"].shape[0] // 4

-            image_projection = ImageProjection(
-                cross_attention_dim=cross_attention_dim,
-                image_embed_dim=clip_embeddings_dim,
-                num_image_text_embeds=num_image_text_embeds,
-            )
-            image_projection.to(dtype=self.dtype, device=self.device)
+        image_projection = ImageProjection(
+            cross_attention_dim=cross_attention_dim, image_embed_dim=clip_embeddings_dim, num_image_text_embeds=4
+        )
+        image_projection.to(dtype=self.dtype, device=self.device)

-            # load image projection layer weights
-            image_proj_state_dict = {}
-            image_proj_state_dict.update(
-                {
-                    "image_embeds.weight": state_dict["image_proj"]["proj.weight"],
-                    "image_embeds.bias": state_dict["image_proj"]["proj.bias"],
-                    "norm.weight": state_dict["image_proj"]["norm.weight"],
-                    "norm.bias": state_dict["image_proj"]["norm.bias"],
-                }
-            )
+        # load image projection layer weights
+        image_proj_state_dict = {}
+        image_proj_state_dict.update(
+            {
+                "image_embeds.weight": state_dict["image_proj"]["proj.weight"],
+                "image_embeds.bias": state_dict["image_proj"]["proj.bias"],
+                "norm.weight": state_dict["image_proj"]["norm.weight"],
+                "norm.bias": state_dict["image_proj"]["norm.bias"],
+            }
+        )

-            image_projection.load_state_dict(image_proj_state_dict)
-
-        else:
-            # IP-Adapter Plus
-            embed_dims = state_dict["image_proj"]["proj_in.weight"].shape[1]
-            output_dims = state_dict["image_proj"]["proj_out.weight"].shape[0]
-            hidden_dims = state_dict["image_proj"]["latents"].shape[2]
-            heads = state_dict["image_proj"]["layers.0.0.to_q.weight"].shape[0] // 64
-
-            image_projection = Resampler(
-                embed_dims=embed_dims,
-                output_dims=output_dims,
-                hidden_dims=hidden_dims,
-                heads=heads,
-                num_queries=num_image_text_embeds,
-            )
-
-            image_proj_state_dict = state_dict["image_proj"]
-
-            new_sd = OrderedDict()
-            for k, v in image_proj_state_dict.items():
-                if "0.to" in k:
-                    k = k.replace("0.to", "2.to")
-                elif "1.0.weight" in k:
-                    k = k.replace("1.0.weight", "3.0.weight")
-                elif "1.0.bias" in k:
-                    k = k.replace("1.0.bias", "3.0.bias")
-                elif "1.1.weight" in k:
-                    k = k.replace("1.1.weight", "3.1.net.0.proj.weight")
-                elif "1.3.weight" in k:
-                    k = k.replace("1.3.weight", "3.1.net.2.weight")
-
-                if "norm1" in k:
-                    new_sd[k.replace("0.norm1", "0")] = v
-                elif "norm2" in k:
-                    new_sd[k.replace("0.norm2", "1")] = v
-                elif "to_kv" in k:
-                    v_chunk = v.chunk(2, dim=0)
-                    new_sd[k.replace("to_kv", "to_k")] = v_chunk[0]
-                    new_sd[k.replace("to_kv", "to_v")] = v_chunk[1]
-                elif "to_out" in k:
-                    new_sd[k.replace("to_out", "to_out.0")] = v
-                else:
-                    new_sd[k] = v
-
-            image_projection.load_state_dict(new_sd)
-            del image_proj_state_dict
+        image_projection.load_state_dict(image_proj_state_dict)

        self.encoder_hid_proj = image_projection.to(device=self.device, dtype=self.dtype)
        self.config.encoder_hid_dim_type = "ip_image_proj"
@@ -34,7 +34,6 @@ if is_torch_available():
    _import_structure["controlnet"] = ["ControlNetModel"]
    _import_structure["dual_transformer_2d"] = ["DualTransformer2DModel"]
    _import_structure["modeling_utils"] = ["ModelMixin"]
-    _import_structure["embeddings"] = ["ImageProjection"]
    _import_structure["prior_transformer"] = ["PriorTransformer"]
    _import_structure["t5_film_transformer"] = ["T5FilmDecoder"]
    _import_structure["transformer_2d"] = ["Transformer2DModel"]
@@ -43,7 +42,7 @@ if is_torch_available():
    _import_structure["unet_2d"] = ["UNet2DModel"]
    _import_structure["unet_2d_condition"] = ["UNet2DConditionModel"]
    _import_structure["unet_3d_condition"] = ["UNet3DConditionModel"]
-    _import_structure["unet_kandinsky3"] = ["Kandinsky3UNet"]
+    _import_structure["unet_kandi3"] = ["Kandinsky3UNet"]
    _import_structure["unet_motion_model"] = ["MotionAdapter", "UNetMotionModel"]
    _import_structure["unet_spatio_temporal_condition"] = ["UNetSpatioTemporalConditionModel"]
    _import_structure["vq_model"] = ["VQModel"]
@@ -64,7 +63,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .consistency_decoder_vae import ConsistencyDecoderVAE
        from .controlnet import ControlNetModel
        from .dual_transformer_2d import DualTransformer2DModel
-        from .embeddings import ImageProjection
        from .modeling_utils import ModelMixin
        from .prior_transformer import PriorTransformer
        from .t5_film_transformer import T5FilmDecoder
@@ -74,7 +72,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .unet_2d import UNet2DModel
        from .unet_2d_condition import UNet2DConditionModel
        from .unet_3d_condition import UNet3DConditionModel
-        from .unet_kandinsky3 import Kandinsky3UNet
+        from .unet_kandi3 import Kandinsky3UNet
        from .unet_motion_model import MotionAdapter, UNetMotionModel
        from .unet_spatio_temporal_condition import UNetSpatioTemporalConditionModel
        from .vq_model import VQModel
@@ -55,12 +55,11 @@ class GELU(nn.Module):
        dim_in (`int`): The number of channels in the input.
        dim_out (`int`): The number of channels in the output.
        approximate (`str`, *optional*, defaults to `"none"`): If `"tanh"`, use tanh approximation.
-        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
    """

-    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none", bias: bool = True):
+    def __init__(self, dim_in: int, dim_out: int, approximate: str = "none"):
        super().__init__()
-        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+        self.proj = nn.Linear(dim_in, dim_out)
        self.approximate = approximate

    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
@@ -82,14 +81,13 @@ class GEGLU(nn.Module):
    Parameters:
        dim_in (`int`): The number of channels in the input.
        dim_out (`int`): The number of channels in the output.
-        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
    """

-    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
+    def __init__(self, dim_in: int, dim_out: int):
        super().__init__()
        linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear

-        self.proj = linear_cls(dim_in, dim_out * 2, bias=bias)
+        self.proj = linear_cls(dim_in, dim_out * 2)

    def gelu(self, gate: torch.Tensor) -> torch.Tensor:
        if gate.device.type != "mps":
@@ -111,12 +109,11 @@ class ApproximateGELU(nn.Module):
    Parameters:
        dim_in (`int`): The number of channels in the input.
        dim_out (`int`): The number of channels in the output.
-        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
    """

-    def __init__(self, dim_in: int, dim_out: int, bias: bool = True):
+    def __init__(self, dim_in: int, dim_out: int):
        super().__init__()
-        self.proj = nn.Linear(dim_in, dim_out, bias=bias)
+        self.proj = nn.Linear(dim_in, dim_out)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.proj(x)
@@ -25,31 +25,6 @@ from .lora import LoRACompatibleLinear
 from .normalization import AdaLayerNorm, AdaLayerNormZero


-def _chunked_feed_forward(
-    ff: nn.Module, hidden_states: torch.Tensor, chunk_dim: int, chunk_size: int, lora_scale: Optional[float] = None
-):
-    # "feed_forward_chunk_size" can be used to save memory
-    if hidden_states.shape[chunk_dim] % chunk_size != 0:
-        raise ValueError(
-            f"`hidden_states` dimension to be chunked: {hidden_states.shape[chunk_dim]} has to be divisible by chunk size: {chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
-        )
-
-    num_chunks = hidden_states.shape[chunk_dim] // chunk_size
-    if lora_scale is None:
-        ff_output = torch.cat(
-            [ff(hid_slice) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
-            dim=chunk_dim,
-        )
-    else:
-        # TOOD(Patrick): LoRA scale can be removed once PEFT refactor is complete
-        ff_output = torch.cat(
-            [ff(hid_slice, scale=lora_scale) for hid_slice in hidden_states.chunk(num_chunks, dim=chunk_dim)],
-            dim=chunk_dim,
-        )
-
-    return ff_output
-
-
@maybe_allow_in_graph
 class GatedSelfAttentionDense(nn.Module):
    r"""
@@ -238,7 +213,7 @@ class BasicTransformerBlock(nn.Module):
        self._chunk_size = None
        self._chunk_dim = 0

-    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int = 0):
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
        # Sets chunk feed-forward
        self._chunk_size = chunk_size
        self._chunk_dim = dim
@@ -341,8 +316,18 @@ class BasicTransformerBlock(nn.Module):

        if self._chunk_size is not None:
            # "feed_forward_chunk_size" can be used to save memory
-            ff_output = _chunked_feed_forward(
-                self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size, lora_scale=lora_scale
+            if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
+                raise ValueError(
+                    f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
+                )
+
+            num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
+            ff_output = torch.cat(
+                [
+                    self.ff(hid_slice, scale=lora_scale)
+                    for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)
+                ],
+                dim=self._chunk_dim,
            )
        else:
            ff_output = self.ff(norm_hidden_states, scale=lora_scale)
@@ -366,10 +351,23 @@ class TemporalBasicTransformerBlock(nn.Module):

    Parameters:
        dim (`int`): The number of channels in the input and output.
-        time_mix_inner_dim (`int`): The number of channels for temporal attention.
        num_attention_heads (`int`): The number of heads to use for multi-head attention.
        attention_head_dim (`int`): The number of channels in each head.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The size of the encoder_hidden_states vector for cross attention.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Configure if the attentions should contain a bias parameter.
+        upcast_attention (`bool`, *optional*):
+            Whether to upcast the attention computation to float32. This is useful for mixed precision training.
+        norm_elementwise_affine (`bool`, *optional*, defaults to `True`):
+            Whether to use learnable elementwise affine parameters for normalization.
+        norm_type (`str`, *optional*, defaults to `"layer_norm"`):
+            The normalization layer to use. Can be `"layer_norm"`, `"ada_norm"` or `"ada_norm_zero"`.
+        final_dropout (`bool`, *optional*, defaults to `False`):
+            Whether to apply a final dropout after the last feed-forward layer.
+        attention_type (`str`, *optional*, defaults to `"default"`):
+            The type of attention to use. Can be `"default"` or `"gated"` or `"gated-text-image"`.
    """

    def __init__(
@@ -378,27 +376,33 @@ class TemporalBasicTransformerBlock(nn.Module):
        time_mix_inner_dim: int,
        num_attention_heads: int,
        attention_head_dim: int,
+        dropout=0.0,
        cross_attention_dim: Optional[int] = None,
+        norm_eps: float = 1e-5,
+        final_dropout: bool = False,
    ):
        super().__init__()
        self.is_res = dim == time_mix_inner_dim

-        self.norm_in = nn.LayerNorm(dim)
+        self.norm_in = nn.LayerNorm(dim, eps=norm_eps)

        # Define 3 blocks. Each block has its own normalization layer.
        # 1. Self-Attn
-        self.norm_in = nn.LayerNorm(dim)
+        self.norm_in = nn.LayerNorm(dim, eps=norm_eps)
        self.ff_in = FeedForward(
            dim,
            dim_out=time_mix_inner_dim,
+            dropout=dropout,
            activation_fn="geglu",
+            final_dropout=final_dropout,
        )

-        self.norm1 = nn.LayerNorm(time_mix_inner_dim)
+        self.norm1 = nn.LayerNorm(time_mix_inner_dim, eps=norm_eps)
        self.attn1 = Attention(
            query_dim=time_mix_inner_dim,
            heads=num_attention_heads,
            dim_head=attention_head_dim,
+            dropout=dropout,
            cross_attention_dim=None,
        )

@@ -407,36 +411,44 @@ class TemporalBasicTransformerBlock(nn.Module):
            # We currently only use AdaLayerNormZero for self attention where there will only be one attention block.
            # I.e. the number of returned modulation chunks from AdaLayerZero would not make sense if returned during
            # the second cross attention block.
-            self.norm2 = nn.LayerNorm(time_mix_inner_dim)
+            self.norm2 = nn.LayerNorm(time_mix_inner_dim, eps=norm_eps)
            self.attn2 = Attention(
                query_dim=time_mix_inner_dim,
                cross_attention_dim=cross_attention_dim,
                heads=num_attention_heads,
                dim_head=attention_head_dim,
+                dropout=dropout,
            )  # is self-attn if encoder_hidden_states is none
        else:
            self.norm2 = None
            self.attn2 = None

        # 3. Feed-forward
-        self.norm3 = nn.LayerNorm(time_mix_inner_dim)
-        self.ff = FeedForward(time_mix_inner_dim, activation_fn="geglu")
+        self.norm3 = nn.LayerNorm(time_mix_inner_dim, eps=norm_eps)
+        self.ff = FeedForward(
+            time_mix_inner_dim,
+            dropout=dropout,
+            activation_fn="geglu",
+            final_dropout=final_dropout,
+        )

        # let chunk size default to None
        self._chunk_size = None
-        self._chunk_dim = None
+        self._chunk_dim = 0

-    def set_chunk_feed_forward(self, chunk_size: Optional[int], **kwargs):
+    def set_chunk_feed_forward(self, chunk_size: Optional[int], dim: int):
        # Sets chunk feed-forward
        self._chunk_size = chunk_size
-        # chunk dim should be hardcoded to 1 to have better speed vs. memory trade-off
-        self._chunk_dim = 1
+        self._chunk_dim = dim

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        num_frames: int,
+        attention_mask: Optional[torch.FloatTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
    ) -> torch.FloatTensor:
        # Notice that normalization is always applied before the real computation in the following blocks.
        # 0. Self-Attention
@@ -451,30 +463,47 @@ class TemporalBasicTransformerBlock(nn.Module):

        residual = hidden_states
        hidden_states = self.norm_in(hidden_states)
-
-        if self._chunk_size is not None:
-            hidden_states = _chunked_feed_forward(self.ff, hidden_states, self._chunk_dim, self._chunk_size)
-        else:
-            hidden_states = self.ff_in(hidden_states)
-
+        hidden_states = self.ff_in(hidden_states)
        if self.is_res:
            hidden_states = hidden_states + residual

+        cross_attention_kwargs = cross_attention_kwargs.copy() if cross_attention_kwargs is not None else {}
+
        norm_hidden_states = self.norm1(hidden_states)
-        attn_output = self.attn1(norm_hidden_states, encoder_hidden_states=None)
+        attn_output = self.attn1(
+            norm_hidden_states,
+            encoder_hidden_states=None,
+            attention_mask=attention_mask,
+            **cross_attention_kwargs,
+        )
        hidden_states = attn_output + hidden_states

        # 3. Cross-Attention
        if self.attn2 is not None:
            norm_hidden_states = self.norm2(hidden_states)
-            attn_output = self.attn2(norm_hidden_states, encoder_hidden_states=encoder_hidden_states)
+            attn_output = self.attn2(
+                norm_hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                **cross_attention_kwargs,
+            )
            hidden_states = attn_output + hidden_states

        # 4. Feed-forward
        norm_hidden_states = self.norm3(hidden_states)

        if self._chunk_size is not None:
-            ff_output = _chunked_feed_forward(self.ff, norm_hidden_states, self._chunk_dim, self._chunk_size)
+            # "feed_forward_chunk_size" can be used to save memory
+            if norm_hidden_states.shape[self._chunk_dim] % self._chunk_size != 0:
+                raise ValueError(
+                    f"`hidden_states` dimension to be chunked: {norm_hidden_states.shape[self._chunk_dim]} has to be divisible by chunk size: {self._chunk_size}. Make sure to set an appropriate `chunk_size` when calling `unet.enable_forward_chunking`."
+                )
+
+            num_chunks = norm_hidden_states.shape[self._chunk_dim] // self._chunk_size
+            ff_output = torch.cat(
+                [self.ff(hid_slice) for hid_slice in norm_hidden_states.chunk(num_chunks, dim=self._chunk_dim)],
+                dim=self._chunk_dim,
+            )
        else:
            ff_output = self.ff(norm_hidden_states)

@@ -501,7 +530,6 @@ class FeedForward(nn.Module):
        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        activation_fn (`str`, *optional*, defaults to `"geglu"`): Activation function to be used in feed-forward.
        final_dropout (`bool` *optional*, defaults to False): Apply a final dropout.
-        bias (`bool`, defaults to True): Whether to use a bias in the linear layer.
    """

    def __init__(
@@ -512,7 +540,6 @@ class FeedForward(nn.Module):
        dropout: float = 0.0,
        activation_fn: str = "geglu",
        final_dropout: bool = False,
-        bias: bool = True,
    ):
        super().__init__()
        inner_dim = int(dim * mult)
@@ -520,13 +547,13 @@ class FeedForward(nn.Module):
        linear_cls = LoRACompatibleLinear if not USE_PEFT_BACKEND else nn.Linear

        if activation_fn == "gelu":
-            act_fn = GELU(dim, inner_dim, bias=bias)
+            act_fn = GELU(dim, inner_dim)
        if activation_fn == "gelu-approximate":
-            act_fn = GELU(dim, inner_dim, approximate="tanh", bias=bias)
+            act_fn = GELU(dim, inner_dim, approximate="tanh")
        elif activation_fn == "geglu":
-            act_fn = GEGLU(dim, inner_dim, bias=bias)
+            act_fn = GEGLU(dim, inner_dim)
        elif activation_fn == "geglu-approximate":
-            act_fn = ApproximateGELU(dim, inner_dim, bias=bias)
+            act_fn = ApproximateGELU(dim, inner_dim)

        self.net = nn.ModuleList([])
        # project in
@@ -534,7 +561,7 @@ class FeedForward(nn.Module):
        # project dropout
        self.net.append(nn.Dropout(dropout))
        # project out
-        self.net.append(linear_cls(inner_dim, dim_out, bias=bias))
+        self.net.append(linear_cls(inner_dim, dim_out))
        # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
        if final_dropout:
            self.net.append(nn.Dropout(dropout))
@@ -16,7 +16,7 @@ from typing import Callable, Optional, Union

 import torch
 import torch.nn.functional as F
-from torch import nn
+from torch import einsum, nn

 from ..utils import USE_PEFT_BACKEND, deprecate, logging
 from ..utils.import_utils import is_xformers_available
@@ -109,17 +109,15 @@ class Attention(nn.Module):
        residual_connection: bool = False,
        _from_deprecated_attn_block: bool = False,
        processor: Optional["AttnProcessor"] = None,
-        out_dim: int = None,
    ):
        super().__init__()
-        self.inner_dim = out_dim if out_dim is not None else dim_head * heads
+        self.inner_dim = dim_head * heads
        self.cross_attention_dim = cross_attention_dim if cross_attention_dim is not None else query_dim
        self.upcast_attention = upcast_attention
        self.upcast_softmax = upcast_softmax
        self.rescale_output_factor = rescale_output_factor
        self.residual_connection = residual_connection
        self.dropout = dropout
-        self.out_dim = out_dim if out_dim is not None else query_dim

        # we make use of this private variable to know whether this class is loaded
        # with an deprecated state dict so that we can convert it on the fly
@@ -128,7 +126,7 @@ class Attention(nn.Module):
        self.scale_qk = scale_qk
        self.scale = dim_head**-0.5 if self.scale_qk else 1.0

-        self.heads = out_dim // dim_head if out_dim is not None else heads
+        self.heads = heads
        # for slice_size > 0 the attention score computation
        # is split across the batch axis to save memory
        # You can set slice_size with `set_attention_slice`
@@ -195,7 +193,7 @@ class Attention(nn.Module):
            self.add_v_proj = linear_cls(added_kv_proj_dim, self.inner_dim)

        self.to_out = nn.ModuleList([])
-        self.to_out.append(linear_cls(self.inner_dim, self.out_dim, bias=out_bias))
+        self.to_out.append(linear_cls(self.inner_dim, query_dim, bias=out_bias))
        self.to_out.append(nn.Dropout(dropout))

        # set attention processor
@@ -2221,6 +2219,44 @@ class IPAdapterAttnProcessor2_0(torch.nn.Module):
        return hidden_states


+# TODO(Yiyi): This class should not exist, we can replace it with a normal attention processor I believe
+# this way torch.compile and co. will work as well
+class Kandi3AttnProcessor:
+    r"""
+    Default Kandinsky 3 processor for performing attention-related computations.
+    """
+
+    @staticmethod
+    def _reshape(hid_states, h):
+        b, n, f = hid_states.shape
+        d = f // h
+        return hid_states.unsqueeze(-1).reshape(b, n, h, d).permute(0, 2, 1, 3)
+
+    def __call__(
+        self,
+        attn,
+        x,
+        context,
+        context_mask=None,
+    ):
+        query = self._reshape(attn.to_q(x), h=attn.num_heads)
+        key = self._reshape(attn.to_k(context), h=attn.num_heads)
+        value = self._reshape(attn.to_v(context), h=attn.num_heads)
+
+        attention_matrix = einsum("b h i d, b h j d -> b h i j", query, key)
+
+        if context_mask is not None:
+            max_neg_value = -torch.finfo(attention_matrix.dtype).max
+            context_mask = context_mask.unsqueeze(1).unsqueeze(1)
+            attention_matrix = attention_matrix.masked_fill(~(context_mask != 0), max_neg_value)
+        attention_matrix = (attention_matrix * attn.scale).softmax(dim=-1)
+
+        out = einsum("b h i j, b h j d -> b h i d", attention_matrix, value)
+        out = out.permute(0, 2, 1, 3).reshape(out.shape[0], out.shape[2], -1)
+        out = attn.to_out[0](out)
+        return out
+
+
 LORA_ATTENTION_PROCESSORS = (
    LoRAAttnProcessor,
    LoRAAttnProcessor2_0,
@@ -2246,6 +2282,7 @@ CROSS_ATTENTION_PROCESSORS = (
    LoRAXFormersAttnProcessor,
    IPAdapterAttnProcessor,
    IPAdapterAttnProcessor2_0,
+    Kandi3AttnProcessor,
 )

 AttentionProcessor = Union[
@@ -18,7 +18,7 @@ import torch.nn as nn

 from ..configuration_utils import ConfigMixin, register_to_config
 from ..utils.accelerate_utils import apply_forward_hook
-from .modeling_outputs import AutoencoderKLOutput
+from .autoencoder_kl import AutoencoderKLOutput
 from .modeling_utils import ModelMixin
 from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder, MaskConditionDecoder

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from dataclasses import dataclass
 from typing import Dict, Optional, Tuple, Union

 import torch
@@ -18,6 +19,7 @@ import torch.nn as nn

 from ..configuration_utils import ConfigMixin, register_to_config
 from ..loaders import FromOriginalVAEMixin
+from ..utils import BaseOutput
 from ..utils.accelerate_utils import apply_forward_hook
 from .attention_processor import (
    ADDED_KV_ATTENTION_PROCESSORS,
@@ -26,11 +28,24 @@ from .attention_processor import (
    AttnAddedKVProcessor,
    AttnProcessor,
 )
-from .modeling_outputs import AutoencoderKLOutput
 from .modeling_utils import ModelMixin
 from .vae import Decoder, DecoderOutput, DiagonalGaussianDistribution, Encoder


+@dataclass
+class AutoencoderKLOutput(BaseOutput):
+    """
+    Output of AutoencoderKL encoding method.
+
+    Args:
+        latent_dist (`DiagonalGaussianDistribution`):
+            Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
+            `DiagonalGaussianDistribution` allows for sampling latents from the distribution.
+    """
+
+    latent_dist: "DiagonalGaussianDistribution"
+
+
 class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
    r"""
    A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
@@ -11,17 +11,23 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, Optional, Tuple, Union
+from dataclasses import dataclass
+from typing import Dict, Iterable, Optional, Tuple, Union

 import torch
 import torch.nn as nn

 from ..configuration_utils import ConfigMixin, register_to_config
 from ..loaders import FromOriginalVAEMixin
-from ..utils import is_torch_version
+from ..utils import BaseOutput, is_torch_version
 from ..utils.accelerate_utils import apply_forward_hook
-from .attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor
-from .modeling_outputs import AutoencoderKLOutput
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
 from .modeling_utils import ModelMixin
 from .unet_3d_blocks import MidBlockTemporalDecoder, UpBlockTemporalDecoder
 from .vae import DecoderOutput, DiagonalGaussianDistribution, Encoder
@@ -32,18 +38,38 @@ class TemporalDecoder(nn.Module):
        self,
        in_channels: int = 4,
        out_channels: int = 3,
-        block_out_channels: Tuple[int] = (128, 256, 512, 512),
+        block_out_channels: Tuple[int, ...] = (
+            128,
+            256,
+            512,
+            512,
+        ),
        layers_per_block: int = 2,
+        norm_num_groups: int = 32,
+        act_fn: str = "silu",
+        norm_type: str = "group",  # group, spatial
+        alpha: float = 0.0,
+        merge_strategy: str = "learned",
+        conv_out_kernel_size=(3, 1, 1),
    ):
        super().__init__()
        self.layers_per_block = layers_per_block

        self.conv_in = nn.Conv2d(in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1)
+        temb_channels = in_channels if norm_type == "spatial" else None
        self.mid_block = MidBlockTemporalDecoder(
            num_layers=self.layers_per_block,
            in_channels=block_out_channels[-1],
            out_channels=block_out_channels[-1],
            attention_head_dim=block_out_channels[-1],
+            resnet_eps=1e-6,
+            temporal_resnet_eps=1e-5,
+            resnet_act_fn=act_fn,
+            norm_num_groups=norm_num_groups,
+            temb_channels=temb_channels,
+            resnet_time_scale_shift=norm_type,
+            merge_factor=alpha,
+            merge_strategy=merge_strategy,
        )

        # up
@@ -60,11 +86,25 @@ class TemporalDecoder(nn.Module):
                in_channels=prev_output_channel,
                out_channels=output_channel,
                add_upsample=not is_final_block,
+                resnet_eps=1e-6,
+                temporal_resnet_eps=1e-5,
+                resnet_act_fn=act_fn,
+                norm_num_groups=norm_num_groups,
+                attention_head_dim=output_channel,
+                temb_channels=temb_channels,
+                resnet_time_scale_shift=norm_type,
+                merge_factor=alpha,
+                merge_strategy=merge_strategy,
            )
            self.up_blocks.append(up_block)
            prev_output_channel = output_channel

-        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, eps=1e-6)
+        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
+
+        if isinstance(conv_out_kernel_size, Iterable):
+            padding = [int(k // 2) for k in conv_out_kernel_size]
+        else:
+            padding = int(conv_out_kernel_size // 2)

        self.conv_act = nn.SiLU()
        self.conv_out = torch.nn.Conv2d(
@@ -73,9 +113,6 @@ class TemporalDecoder(nn.Module):
            kernel_size=3,
            padding=1,
        )
-
-        conv_out_kernel_size = (3, 1, 1)
-        padding = [int(k // 2) for k in conv_out_kernel_size]
        self.time_conv_out = torch.nn.Conv3d(
            in_channels=out_channels,
            out_channels=out_channels,
@@ -90,6 +127,7 @@ class TemporalDecoder(nn.Module):
        sample: torch.FloatTensor,
        image_only_indicator: torch.FloatTensor,
        num_frames: int = 1,
+        latent_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.FloatTensor:
        r"""The forward method of the `Decoder` class."""

@@ -110,6 +148,8 @@ class TemporalDecoder(nn.Module):
                    create_custom_forward(self.mid_block),
                    sample,
                    image_only_indicator,
+                    latent_embeds,
+                    num_frames,
                    use_reentrant=False,
                )
                sample = sample.to(upscale_dtype)
@@ -120,6 +160,8 @@ class TemporalDecoder(nn.Module):
                        create_custom_forward(up_block),
                        sample,
                        image_only_indicator,
+                        latent_embeds,
+                        num_frames,
                        use_reentrant=False,
                    )
            else:
@@ -128,6 +170,8 @@ class TemporalDecoder(nn.Module):
                    create_custom_forward(self.mid_block),
                    sample,
                    image_only_indicator,
+                    latent_embeds,
+                    num_frames,
                )
                sample = sample.to(upscale_dtype)

@@ -137,18 +181,34 @@ class TemporalDecoder(nn.Module):
                        create_custom_forward(up_block),
                        sample,
                        image_only_indicator,
+                        latent_embeds,
+                        num_frames,
                    )
        else:
            # middle
-            sample = self.mid_block(sample, image_only_indicator=image_only_indicator)
+            sample = self.mid_block(
+                sample,
+                temb=latent_embeds,
+                num_frames=num_frames,
+                image_only_indicator=image_only_indicator,
+            )
            sample = sample.to(upscale_dtype)

            # up
            for up_block in self.up_blocks:
-                sample = up_block(sample, image_only_indicator=image_only_indicator)
+                sample = up_block(
+                    sample,
+                    temb=latent_embeds,
+                    num_frames=num_frames,
+                    image_only_indicator=image_only_indicator,
+                )

        # post-process
-        sample = self.conv_norm_out(sample)
+        if latent_embeds is None:
+            sample = self.conv_norm_out(sample)
+        else:
+            sample = self.conv_norm_out(sample, latent_embeds)
+
        sample = self.conv_act(sample)
        sample = self.conv_out(sample)

@@ -162,6 +222,20 @@ class TemporalDecoder(nn.Module):
        return sample


+@dataclass
+class AutoencoderKLOutput(BaseOutput):
+    """
+    Output of AutoencoderKL encoding method.
+
+    Args:
+        latent_dist (`DiagonalGaussianDistribution`):
+            Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
+            `DiagonalGaussianDistribution` allows for sampling latents from the distribution.
+    """
+
+    latent_dist: "DiagonalGaussianDistribution"
+
+
 class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin, FromOriginalVAEMixin):
    r"""
    A VAE model with KL loss for encoding images into latents and decoding latent representations into images.
@@ -174,9 +248,11 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin, FromOriginalVAEMixin
        out_channels (int,  *optional*, defaults to 3): Number of channels in the output.
        down_block_types (`Tuple[str]`, *optional*, defaults to `("DownEncoderBlock2D",)`):
            Tuple of downsample block types.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpDecoderBlock2D",)`):
+            Tuple of upsample block types.
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(64,)`):
            Tuple of block output channels.
-        layers_per_block: (`int`, *optional*, defaults to 1): Number of layers per block.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
        latent_channels (`int`, *optional*, defaults to 4): Number of channels in the latent space.
        sample_size (`int`, *optional*, defaults to `32`): Sample input size.
        scaling_factor (`float`, *optional*, defaults to 0.18215):
@@ -202,7 +278,9 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin, FromOriginalVAEMixin
        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
        block_out_channels: Tuple[int] = (64,),
        layers_per_block: int = 1,
+        act_fn: str = "silu",
        latent_channels: int = 4,
+        norm_num_groups: int = 32,
        sample_size: int = 32,
        scaling_factor: float = 0.18215,
        force_upcast: float = True,
@@ -216,6 +294,8 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin, FromOriginalVAEMixin
            down_block_types=down_block_types,
            block_out_channels=block_out_channels,
            layers_per_block=layers_per_block,
+            act_fn=act_fn,
+            norm_num_groups=norm_num_groups,
            double_z=True,
        )

@@ -225,10 +305,17 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin, FromOriginalVAEMixin
            out_channels=out_channels,
            block_out_channels=block_out_channels,
            layers_per_block=layers_per_block,
+            norm_num_groups=norm_num_groups,
+            act_fn=act_fn,
        )

        self.quant_conv = nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1)

+        self.use_slicing = False
+        self.use_tiling = False
+
+        # only relevant if vae tiling is enabled
+        self.tile_sample_min_size = self.config.sample_size
        sample_size = (
            self.config.sample_size[0]
            if isinstance(self.config.sample_size, (list, tuple))
@@ -241,6 +328,35 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin, FromOriginalVAEMixin
        if isinstance(module, (Encoder, TemporalDecoder)):
            module.gradient_checkpointing = value

+    def enable_tiling(self, use_tiling: bool = True):
+        r"""
+        Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
+        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
+        processing larger images.
+        """
+        self.use_tiling = use_tiling
+
+    def disable_tiling(self):
+        r"""
+        Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.enable_tiling(False)
+
+    def enable_slicing(self):
+        r"""
+        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
+        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
+        """
+        self.use_slicing = True
+
+    def disable_slicing(self):
+        r"""
+        Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
+        decoding in one step.
+        """
+        self.use_slicing = False
+
    @property
    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.attn_processors
    def attn_processors(self) -> Dict[str, AttentionProcessor]:
@@ -303,11 +419,14 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin, FromOriginalVAEMixin
        for name, module in self.named_children():
            fn_recursive_attn_processor(name, module, processor)

+    # Copied from diffusers.models.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor
    def set_default_attn_processor(self):
        """
        Disables custom attention processors and sets the default attention implementation.
        """
-        if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+        if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+            processor = AttnAddedKVProcessor()
+        elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
            processor = AttnProcessor()
        else:
            raise ValueError(
@@ -332,7 +451,15 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin, FromOriginalVAEMixin
                The latent representations of the encoded images. If `return_dict` is True, a
                [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
        """
-        h = self.encoder(x)
+        if self.use_tiling and (x.shape[-1] > self.tile_sample_min_size or x.shape[-2] > self.tile_sample_min_size):
+            return self.tiled_encode(x, return_dict=return_dict)
+
+        if self.use_slicing and x.shape[0] > 1:
+            encoded_slices = [self.encoder(x_slice) for x_slice in x.split(1)]
+            h = torch.cat(encoded_slices)
+        else:
+            h = self.encoder(x)
+
        moments = self.quant_conv(h)
        posterior = DiagonalGaussianDistribution(moments)

@@ -341,12 +468,29 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin, FromOriginalVAEMixin

        return AutoencoderKLOutput(latent_dist=posterior)

+    def _decode(
+        self, z: torch.FloatTensor, num_frames: int, return_dict: bool = True
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        if self.use_tiling and (z.shape[-1] > self.tile_latent_min_size or z.shape[-2] > self.tile_latent_min_size):
+            return self.tiled_decode(z, return_dict=return_dict)
+
+        batch_size = z.shape[0] // num_frames
+        # TODO: don't hardcode this
+        image_only_indicator = torch.zeros(batch_size, num_frames, dtype=z.dtype, device=z.device)
+        dec = self.decoder(z, num_frames=num_frames, image_only_indicator=image_only_indicator)
+
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
    @apply_forward_hook
    def decode(
        self,
        z: torch.FloatTensor,
        num_frames: int,
        return_dict: bool = True,
+        generator=None,
    ) -> Union[DecoderOutput, torch.FloatTensor]:
        """
        Decode a batch of images.
@@ -362,15 +506,141 @@ class AutoencoderKLTemporalDecoder(ModelMixin, ConfigMixin, FromOriginalVAEMixin
                returned.

        """
-        batch_size = z.shape[0] // num_frames
-        image_only_indicator = torch.zeros(batch_size, num_frames, dtype=z.dtype, device=z.device)
-        decoded = self.decoder(z, num_frames=num_frames, image_only_indicator=image_only_indicator)
+        if self.use_slicing and z.shape[0] > 1:
+            decoded_slices = [self._decode(z_slice, num_frames // 2).sample for z_slice in z.split(1)]
+            decoded = torch.cat(decoded_slices)
+        else:
+            decoded = self._decode(z, num_frames).sample

        if not return_dict:
            return (decoded,)

        return DecoderOutput(sample=decoded)

+    def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        """
+        Cross-fade the last rows of `a` into the first rows of `b` along dimension 2.
+
+        `b` is written in place and is also the return value. The number of blended rows is `blend_extent`,
+        capped by the size of dimension 2 of both tensors. The weight given to `b` grows linearly with the row
+        index, starting at 0 for the first row.
+        """
+        extent = min(a.shape[2], b.shape[2], blend_extent)
+        for row in range(extent):
+            weight = row / extent
+            # `row - extent` is a negative index: it walks the trailing `extent` rows of `a`.
+            b[:, :, row, :] = a[:, :, row - extent, :] * (1 - weight) + b[:, :, row, :] * weight
+        return b
+
+    def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
+        """
+        Cross-fade the last columns of `a` into the first columns of `b` along dimension 3.
+
+        `b` is written in place and is also the return value. The number of blended columns is `blend_extent`,
+        capped by the size of dimension 3 of both tensors. The weight given to `b` grows linearly with the
+        column index, starting at 0 for the first column.
+        """
+        extent = min(a.shape[3], b.shape[3], blend_extent)
+        for col in range(extent):
+            weight = col / extent
+            # `col - extent` is a negative index: it walks the trailing `extent` columns of `a`.
+            b[:, :, :, col] = a[:, :, :, col - extent] * (1 - weight) + b[:, :, :, col] * weight
+        return b
+
+    def tiled_encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput:
+        r"""Encode a batch of images tile by tile.
+
+        The input is cut into overlapping square tiles of side `tile_sample_min_size`. Every tile is passed through
+        the encoder and `quant_conv` on its own, neighbouring results are cross-faded over their overlap, and the
+        cropped tiles are concatenated into one set of moments. This is meant to keep the memory needed per encoder
+        call independent of the image size. Because each tile is encoded independently, the result differs from
+        non-tiled encoding; the blending makes the seams less noticeable but tile-sized changes may remain.
+
+        Args:
+            x (`torch.FloatTensor`): Input batch of images.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.autoencoder_kl.AutoencoderKLOutput`] instead of a plain tuple.
+
+        Returns:
+            [`~models.autoencoder_kl.AutoencoderKLOutput`] or `tuple`:
+                If return_dict is True, a [`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain
+                `tuple` is returned.
+        """
+        tile_size = self.tile_sample_min_size
+        # Distance between the origins of consecutive tiles, in sample space.
+        stride = int(tile_size * (1 - self.tile_overlap_factor))
+        # Width of the cross-fade and of the part of each tile that is kept, both in latent space.
+        blend_extent = int(self.tile_latent_min_size * self.tile_overlap_factor)
+        keep = self.tile_latent_min_size - blend_extent
+
+        # Encode every tile separately, visiting them row by row.
+        encoded = [
+            [
+                self.quant_conv(self.encoder(x[:, :, top : top + tile_size, left : left + tile_size]))
+                for left in range(0, x.shape[3], stride)
+            ]
+            for top in range(0, x.shape[2], stride)
+        ]
+
+        stitched_rows = []
+        for r, tile_row in enumerate(encoded):
+            cropped = []
+            for c, tile in enumerate(tile_row):
+                # Fade in the tile above first, then the tile to the left; both blends write into `tile`.
+                if r > 0:
+                    tile = self.blend_v(encoded[r - 1][c], tile, blend_extent)
+                if c > 0:
+                    tile = self.blend_h(tile_row[c - 1], tile, blend_extent)
+                cropped.append(tile[:, :, :keep, :keep])
+            stitched_rows.append(torch.cat(cropped, dim=3))
+
+        posterior = DiagonalGaussianDistribution(torch.cat(stitched_rows, dim=2))
+
+        return AutoencoderKLOutput(latent_dist=posterior) if return_dict else (posterior,)
+
+
+    def tiled_decode(
+        self, z: torch.FloatTensor, return_dict: bool = True, num_frames: int = 1
+    ) -> Union[DecoderOutput, torch.FloatTensor]:
+        r"""
+        Decode a batch of latents tile by tile.
+
+        The latents are cut into overlapping square tiles of side `tile_latent_min_size`. Every tile is decoded on
+        its own, neighbouring results are cross-faded over their overlap, and the cropped tiles are concatenated
+        into the output sample.
+
+        Args:
+            z (`torch.FloatTensor`): Input batch of latent vectors.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
+            num_frames (`int`, *optional*, defaults to 1):
+                Frame count forwarded to the decoder together with an all-zero `image_only_indicator`, mirroring
+                the non-tiled path in `_decode`. With the default of 1 every latent is treated as its own
+                single-frame clip; pass the real frame count to decode a clip as a whole.
+
+        Returns:
+            [`~models.vae.DecoderOutput`] or `tuple`:
+                If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
+                returned.
+        """
+        overlap_size = int(self.tile_latent_min_size * (1 - self.tile_overlap_factor))
+        blend_extent = int(self.tile_sample_min_size * self.tile_overlap_factor)
+        row_limit = self.tile_sample_min_size - blend_extent
+
+        # The decoder is called with `num_frames` and `image_only_indicator` everywhere else in this class, so the
+        # tiled path builds the same all-zero indicator once and reuses it for every tile (tiling only cuts the
+        # two spatial dimensions, the leading dimension is unchanged).
+        batch_size = z.shape[0] // num_frames
+        image_only_indicator = torch.zeros(batch_size, num_frames, dtype=z.dtype, device=z.device)
+
+        # Split z into overlapping tiles of side `tile_latent_min_size` and decode them separately.
+        # The tiles have an overlap to avoid seams between tiles.
+        rows = []
+        for i in range(0, z.shape[2], overlap_size):
+            row = []
+            for j in range(0, z.shape[3], overlap_size):
+                tile = z[
+                    :,
+                    :,
+                    i : i + self.tile_latent_min_size,
+                    j : j + self.tile_latent_min_size,
+                ]
+                # NOTE(review): the non-tiled `_decode` path does not apply `post_quant_conv` -- confirm this
+                # attribute still exists on the class and is wanted here.
+                tile = self.post_quant_conv(tile)
+                decoded = self.decoder(tile, num_frames=num_frames, image_only_indicator=image_only_indicator)
+                row.append(decoded)
+            rows.append(row)
+        result_rows = []
+        for i, row in enumerate(rows):
+            result_row = []
+            for j, tile in enumerate(row):
+                # blend the above tile and the left tile
+                # to the current tile and add the current tile to the result row
+                if i > 0:
+                    tile = self.blend_v(rows[i - 1][j], tile, blend_extent)
+                if j > 0:
+                    tile = self.blend_h(row[j - 1], tile, blend_extent)
+                result_row.append(tile[:, :, :row_limit, :row_limit])
+            result_rows.append(torch.cat(result_row, dim=3))
+
+        dec = torch.cat(result_rows, dim=2)
+        if not return_dict:
+            return (dec,)
+
+        return DecoderOutput(sample=dec)
+
+
    def forward(
        self,
        sample: torch.FloatTensor,
@@ -30,7 +30,12 @@ from .attention_processor import (
 )
 from .embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, TimestepEmbedding, Timesteps
 from .modeling_utils import ModelMixin
-from .unet_2d_blocks import CrossAttnDownBlock2D, DownBlock2D, UNetMidBlock2D, UNetMidBlock2DCrossAttn, get_down_block
+from .unet_2d_blocks import (
+    CrossAttnDownBlock2D,
+    DownBlock2D,
+    UNetMidBlock2DCrossAttn,
+    get_down_block,
+)
 from .unet_2d_condition import UNet2DConditionModel


@@ -186,7 +191,6 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
            "CrossAttnDownBlock2D",
            "DownBlock2D",
        ),
-        mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
        only_cross_attention: Union[bool, Tuple[bool]] = False,
        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
        layers_per_block: int = 2,
@@ -405,35 +409,20 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
        controlnet_block = zero_module(controlnet_block)
        self.controlnet_mid_block = controlnet_block

-        if mid_block_type == "UNetMidBlock2DCrossAttn":
-            self.mid_block = UNetMidBlock2DCrossAttn(
-                transformer_layers_per_block=transformer_layers_per_block[-1],
-                in_channels=mid_block_channel,
-                temb_channels=time_embed_dim,
-                resnet_eps=norm_eps,
-                resnet_act_fn=act_fn,
-                output_scale_factor=mid_block_scale_factor,
-                resnet_time_scale_shift=resnet_time_scale_shift,
-                cross_attention_dim=cross_attention_dim,
-                num_attention_heads=num_attention_heads[-1],
-                resnet_groups=norm_num_groups,
-                use_linear_projection=use_linear_projection,
-                upcast_attention=upcast_attention,
-            )
-        elif mid_block_type == "UNetMidBlock2D":
-            self.mid_block = UNetMidBlock2D(
-                in_channels=block_out_channels[-1],
-                temb_channels=time_embed_dim,
-                num_layers=0,
-                resnet_eps=norm_eps,
-                resnet_act_fn=act_fn,
-                output_scale_factor=mid_block_scale_factor,
-                resnet_groups=norm_num_groups,
-                resnet_time_scale_shift=resnet_time_scale_shift,
-                add_attention=False,
-            )
-        else:
-            raise ValueError(f"unknown mid_block_type : {mid_block_type}")
+        self.mid_block = UNetMidBlock2DCrossAttn(
+            transformer_layers_per_block=transformer_layers_per_block[-1],
+            in_channels=mid_block_channel,
+            temb_channels=time_embed_dim,
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
+            output_scale_factor=mid_block_scale_factor,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            cross_attention_dim=cross_attention_dim,
+            num_attention_heads=num_attention_heads[-1],
+            resnet_groups=norm_num_groups,
+            use_linear_projection=use_linear_projection,
+            upcast_attention=upcast_attention,
+        )

    @classmethod
    def from_unet(
@@ -442,7 +431,6 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
        controlnet_conditioning_channel_order: str = "rgb",
        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
        load_weights_from_unet: bool = True,
-        conditioning_channels: int = 3,
    ):
        r"""
        Instantiate a [`ControlNetModel`] from [`UNet2DConditionModel`].
@@ -489,10 +477,8 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):
            upcast_attention=unet.config.upcast_attention,
            resnet_time_scale_shift=unet.config.resnet_time_scale_shift,
            projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim,
-            mid_block_type=unet.config.mid_block_type,
            controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
            conditioning_embedding_out_channels=conditioning_embedding_out_channels,
-            conditioning_channels=conditioning_channels,
        )

        if load_weights_from_unet:
@@ -811,16 +797,13 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlnetMixin):

        # 4. mid
        if self.mid_block is not None:
-            if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention:
-                sample = self.mid_block(
-                    sample,
-                    emb,
-                    encoder_hidden_states=encoder_hidden_states,
-                    attention_mask=attention_mask,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                )
-            else:
-                sample = self.mid_block(sample, emb)
+            sample = self.mid_block(
+                sample,
+                emb,
+                encoder_hidden_states=encoder_hidden_states,
+                attention_mask=attention_mask,
+                cross_attention_kwargs=cross_attention_kwargs,
+            )

        # 5. Control net blocks

@@ -20,7 +20,6 @@ from torch import nn

 from ..utils import USE_PEFT_BACKEND
 from .activations import get_activation
-from .attention_processor import Attention
 from .lora import LoRACompatibleLinear


@@ -791,91 +790,3 @@ class CaptionProjection(nn.Module):
        hidden_states = self.act_1(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states
-
-
-class Resampler(nn.Module):
-    """Resampler of IP-Adapter Plus.
-
-    Args:
-    ----
-        embed_dims (int): The feature dimension. Defaults to 768.
-        output_dims (int): The number of output channels, that is the same
-            number of the channels in the
-            `unet.config.cross_attention_dim`. Defaults to 1024.
-        hidden_dims (int): The number of hidden channels. Defaults to 1280.
-        depth (int): The number of blocks. Defaults to 8.
-        dim_head (int): The number of head channels. Defaults to 64.
-        heads (int): Parallel attention heads. Defaults to 16.
-        num_queries (int): The number of queries. Defaults to 8.
-        ffn_ratio (float): The expansion ratio of feedforward network hidden
-            layer channels. Defaults to 4.
-    """
-
-    def __init__(
-        self,
-        embed_dims: int = 768,
-        output_dims: int = 1024,
-        hidden_dims: int = 1280,
-        depth: int = 4,
-        dim_head: int = 64,
-        heads: int = 16,
-        num_queries: int = 8,
-        ffn_ratio: float = 4,
-    ) -> None:
-        super().__init__()
-        from .attention import FeedForward  # Lazy import to avoid circular import
-
-        self.latents = nn.Parameter(torch.randn(1, num_queries, hidden_dims) / hidden_dims**0.5)
-
-        self.proj_in = nn.Linear(embed_dims, hidden_dims)
-
-        self.proj_out = nn.Linear(hidden_dims, output_dims)
-        self.norm_out = nn.LayerNorm(output_dims)
-
-        self.layers = nn.ModuleList([])
-        for _ in range(depth):
-            self.layers.append(
-                nn.ModuleList(
-                    [
-                        nn.LayerNorm(hidden_dims),
-                        nn.LayerNorm(hidden_dims),
-                        Attention(
-                            query_dim=hidden_dims,
-                            dim_head=dim_head,
-                            heads=heads,
-                            out_bias=False,
-                        ),
-                        nn.Sequential(
-                            nn.LayerNorm(hidden_dims),
-                            FeedForward(hidden_dims, hidden_dims, activation_fn="gelu", mult=ffn_ratio, bias=False),
-                        ),
-                    ]
-                )
-            )
-
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
-        """Forward pass.
-
-        Args:
-        ----
-            x (torch.Tensor): Input Tensor.
-
-        Returns:
-        -------
-            torch.Tensor: Output Tensor.
-        """
-        latents = self.latents.repeat(x.size(0), 1, 1)
-
-        x = self.proj_in(x)
-
-        for ln0, ln1, attn, ff in self.layers:
-            residual = latents
-
-            encoder_hidden_states = ln0(x)
-            latents = ln1(latents)
-            encoder_hidden_states = torch.cat([encoder_hidden_states, latents], dim=-2)
-            latents = attn(latents, encoder_hidden_states) + residual
-            latents = ff(latents) + latents
-
-        latents = self.proj_out(latents)
-        return self.norm_out(latents)
@@ -1,17 +0,0 @@
-from dataclasses import dataclass
-
-from ..utils import BaseOutput
-
-
-@dataclass
-class AutoencoderKLOutput(BaseOutput):
-    """
-    Output of AutoencoderKL encoding method.
-
-    Args:
-        latent_dist (`DiagonalGaussianDistribution`):
-            Encoded outputs of `Encoder` represented as the mean and logvar of `DiagonalGaussianDistribution`.
-            `DiagonalGaussianDistribution` allows for sampling latents from the distribution.
-    """
-
-    latent_dist: "DiagonalGaussianDistribution"  # noqa: F821
@@ -1118,27 +1118,64 @@ class TemporalResnetBlock(nn.Module):
        in_channels (`int`): The number of channels in the input.
        out_channels (`int`, *optional*, default to be `None`):
            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
+        dropout (`float`, *optional*, defaults to `0.0`): The dropout probability to use.
        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
+        groups (`int`, *optional*, default to `32`): The number of groups to use for the first normalization layer.
+        groups_out (`int`, *optional*, default to None):
+            The number of groups to use for the second normalization layer. if set to None, same as `groups`.
        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the normalization.
+        non_linearity (`str`, *optional*, default to `"swish"`): the activation function to use.
+        time_embedding_norm (`str`, *optional*, default to `"default"` ): Time scale shift config.
+            By default, apply timestep embedding conditioning with a simple shift mechanism. Choose "scale_shift" or
+            "ada_group" for a stronger conditioning with scale and shift.
+        kernel_size (`Tuple[int, int, int]`, *optional*, defaults to `(3, 1, 1)`):
+            Kernel size passed to `conv1` and `conv2`; the padding is derived from it as `k // 2` per dimension.
+        output_scale_factor (`float`, *optional*, default to be `1.0`): the scale factor to use for the output.
+        use_in_shortcut (`bool`, *optional*, default to `True`):
+            If `True`, add a 1x1 nn.conv2d layer for skip-connection.
+        up (`bool`, *optional*, default to `False`): If `True`, add an upsample layer.
+        down (`bool`, *optional*, default to `False`): If `True`, add a downsample layer.
+        conv_shortcut_bias (`bool`, *optional*, default to `True`):  If `True`, adds a learnable bias to the
+            `conv_shortcut` output.
+        conv_2d_out_channels (`int`, *optional*, default to `None`): the number of channels in the output.
+            If None, same as `out_channels`.
    """

    def __init__(
        self,
        in_channels: int,
        out_channels: Optional[int] = None,
+        conv_shortcut: bool = False,
+        dropout: float = 0.0,
        temb_channels: int = 512,
+        groups: int = 32,
+        groups_out: Optional[int] = None,
        eps: float = 1e-6,
+        non_linearity: str = "swish",
+        kernel_size: Optional[torch.FloatTensor] = (3, 1, 1),
+        output_scale_factor: float = 1.0,
+        use_in_shortcut: Optional[bool] = None,
+        conv_shortcut_bias: bool = True,
+        conv_2d_out_channels: Optional[int] = None,
    ):
        super().__init__()
        self.in_channels = in_channels
        out_channels = in_channels if out_channels is None else out_channels
        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.output_scale_factor = output_scale_factor
+
+        linear_cls = nn.Linear
+        conv_cls = nn.Conv3d

-        kernel_size = (3, 1, 1)
        padding = [k // 2 for k in kernel_size]

-        self.norm1 = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=eps, affine=True)
-        self.conv1 = nn.Conv3d(
+        if groups_out is None:
+            groups_out = groups
+
+        self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+
+        self.conv1 = conv_cls(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
@@ -1147,89 +1184,86 @@ class TemporalResnetBlock(nn.Module):
        )

        if temb_channels is not None:
-            self.time_emb_proj = nn.Linear(temb_channels, out_channels)
+            self.time_emb_proj = linear_cls(temb_channels, out_channels)
        else:
            self.time_emb_proj = None

-        self.norm2 = torch.nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=eps, affine=True)
+        self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)

-        self.dropout = torch.nn.Dropout(0.0)
-        self.conv2 = nn.Conv3d(
-            out_channels,
+        self.dropout = torch.nn.Dropout(dropout)
+        conv_2d_out_channels = conv_2d_out_channels or out_channels
+        self.conv2 = conv_cls(
            out_channels,
+            conv_2d_out_channels,
            kernel_size=kernel_size,
            stride=1,
            padding=padding,
        )

-        self.nonlinearity = get_activation("silu")
+        self.nonlinearity = get_activation(non_linearity)

-        self.use_in_shortcut = self.in_channels != out_channels
+        self.use_in_shortcut = self.in_channels != conv_2d_out_channels if use_in_shortcut is None else use_in_shortcut

        self.conv_shortcut = None
        if self.use_in_shortcut:
-            self.conv_shortcut = nn.Conv3d(
+            self.conv_shortcut = conv_cls(
                in_channels,
-                out_channels,
+                conv_2d_out_channels,
                kernel_size=1,
                stride=1,
                padding=0,
+                bias=conv_shortcut_bias,
            )

    def forward(self, input_tensor: torch.FloatTensor, temb: torch.FloatTensor) -> torch.FloatTensor:
        hidden_states = input_tensor

        hidden_states = self.norm1(hidden_states)
-        hidden_states = self.nonlinearity(hidden_states)
-        hidden_states = self.conv1(hidden_states)

+        hidden_states = self.nonlinearity(hidden_states)
+
+        hidden_states = self.conv1(hidden_states)
        if self.time_emb_proj is not None:
            temb = self.nonlinearity(temb)
            temb = self.time_emb_proj(temb)[:, :, :, None, None]
+
+        if temb is not None:
            temb = temb.permute(0, 2, 1, 3, 4)
            hidden_states = hidden_states + temb

        hidden_states = self.norm2(hidden_states)
+
        hidden_states = self.nonlinearity(hidden_states)
+
        hidden_states = self.dropout(hidden_states)
        hidden_states = self.conv2(hidden_states)

        if self.conv_shortcut is not None:
            input_tensor = self.conv_shortcut(input_tensor)

-        output_tensor = input_tensor + hidden_states
+        output_tensor = (input_tensor + hidden_states) / self.output_scale_factor

        return output_tensor


 # VideoResBlock
 class SpatioTemporalResBlock(nn.Module):
-    r"""
-    A SpatioTemporal Resnet block.
-
-    Parameters:
-        in_channels (`int`): The number of channels in the input.
-        out_channels (`int`, *optional*, default to be `None`):
-            The number of output channels for the first conv2d layer. If None, same as `in_channels`.
-        temb_channels (`int`, *optional*, default to `512`): the number of channels in timestep embedding.
-        eps (`float`, *optional*, defaults to `1e-6`): The epsilon to use for the spatial resenet.
-        temporal_eps (`float`, *optional*, defaults to `eps`): The epsilon to use for the temporal resnet.
-        merge_factor (`float`, *optional*, defaults to `0.5`): The merge factor to use for the temporal mixing.
-        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
-            The merge strategy to use for the temporal mixing.
-        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
-            If `True`, switch the spatial and temporal mixing.
-    """
-
    def __init__(
        self,
        in_channels: int,
        out_channels: Optional[int] = None,
+        dropout: float = 0.0,
        temb_channels: int = 512,
+        groups: int = 32,
+        pre_norm: bool = True,
        eps: float = 1e-6,
        temporal_eps: Optional[float] = None,
+        non_linearity: str = "swish",
+        time_embedding_norm: str = "default",  # default, scale_shift, ada_group, spatial
+        output_scale_factor: float = 1.0,
+        kernel_size_3d: Optional[torch.FloatTensor] = (3, 1, 1),
        merge_factor: float = 0.5,
-        merge_strategy="learned_with_images",
+        merge_strategy="learned",
        switch_spatial_to_temporal_mix: bool = False,
    ):
        super().__init__()
@@ -1239,6 +1273,12 @@ class SpatioTemporalResBlock(nn.Module):
            out_channels=out_channels,
            temb_channels=temb_channels,
            eps=eps,
+            groups=groups,
+            dropout=dropout,
+            time_embedding_norm=time_embedding_norm,
+            non_linearity=non_linearity,
+            output_scale_factor=output_scale_factor,
+            pre_norm=pre_norm,
        )

        self.temporal_res_block = TemporalResnetBlock(
@@ -1246,6 +1286,11 @@ class SpatioTemporalResBlock(nn.Module):
            out_channels=out_channels if out_channels is not None else in_channels,
            temb_channels=temb_channels,
            eps=temporal_eps if temporal_eps is not None else eps,
+            groups=groups,
+            dropout=dropout,
+            non_linearity=non_linearity,
+            output_scale_factor=output_scale_factor,
+            kernel_size=kernel_size_3d,
        )

        self.time_mixer = AlphaBlender(
@@ -1258,10 +1303,11 @@ class SpatioTemporalResBlock(nn.Module):
        self,
        hidden_states: torch.FloatTensor,
        temb: Optional[torch.FloatTensor] = None,
+        num_frames: int = 1,
        image_only_indicator: Optional[torch.Tensor] = None,
+        scale: float = 1.0,
    ):
-        num_frames = image_only_indicator.shape[-1]
-        hidden_states = self.spatial_res_block(hidden_states, temb)
+        hidden_states = self.spatial_res_block(hidden_states, temb, scale=scale)

        batch_frames, channels, height, width = hidden_states.shape
        batch_size = batch_frames // num_frames
@@ -1288,17 +1334,6 @@ class SpatioTemporalResBlock(nn.Module):


 class AlphaBlender(nn.Module):
-    r"""
-    A module to blend spatial and temporal features.
-
-    Parameters:
-        alpha (`float`): The initial value of the blending factor.
-        merge_strategy (`str`, *optional*, defaults to `learned_with_images`):
-            The merge strategy to use for the temporal mixing.
-        switch_spatial_to_temporal_mix (`bool`, *optional*, defaults to `False`):
-            If `True`, switch the spatial and temporal mixing.
-    """
-
    strategies = ["learned", "fixed", "learned_with_images"]

    def __init__(
@@ -1311,8 +1346,7 @@ class AlphaBlender(nn.Module):
        self.merge_strategy = merge_strategy
        self.switch_spatial_to_temporal_mix = switch_spatial_to_temporal_mix  # For TemporalVAE

-        if merge_strategy not in self.strategies:
-            raise ValueError(f"merge_strategy needs to be in {self.strategies}")
+        assert merge_strategy in self.strategies, f"merge_strategy needs to be in {self.strategies}"

        if self.merge_strategy == "fixed":
            self.register_buffer("mix_factor", torch.Tensor([alpha]))
@@ -1329,9 +1363,9 @@ class AlphaBlender(nn.Module):
            alpha = torch.sigmoid(self.mix_factor)

        elif self.merge_strategy == "learned_with_images":
-            if image_only_indicator is None:
-                raise ValueError("Please provide image_only_indicator to use learned_with_images merge strategy")
-
+            assert (
+                image_only_indicator is not None
+            ), "Please provide image_only_indicator to use learned_with_images merge strategy"
            alpha = torch.where(
                image_only_indicator.bool(),
                torch.ones(1, 1, device=image_only_indicator.device),
@@ -199,7 +199,8 @@ class TransformerTemporalModel(ModelMixin, ConfigMixin):
        return TransformerTemporalModelOutput(sample=output)


-class TransformerSpatioTemporalModel(nn.Module):
+# VideoBlock
+class TransformerSpatioTemporalModel(ModelMixin, ConfigMixin):
    """
    A Transformer model for video-like data.

@@ -208,12 +209,27 @@ class TransformerSpatioTemporalModel(nn.Module):
        attention_head_dim (`int`, *optional*, defaults to 88): The number of channels in each head.
        in_channels (`int`, *optional*):
            The number of channels in the input and output (specify if the input is **continuous**).
-        out_channels (`int`, *optional*):
-            The number of channels in the output (specify if the input is **continuous**).
        num_layers (`int`, *optional*, defaults to 1): The number of layers of Transformer blocks to use.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
        cross_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
+        attention_bias (`bool`, *optional*):
+            Configure if the `TransformerBlock` attention should contain a bias parameter.
+        sample_size (`int`, *optional*): The width of the latent images (specify if the input is **discrete**).
+            This is fixed during training since it is used to learn a number of position embeddings.
+        activation_fn (`str`, *optional*, defaults to `"geglu"`):
+            Activation function to use in feed-forward. See `diffusers.models.activations.get_activation` for supported
+            activation functions.
+        norm_elementwise_affine (`bool`, *optional*):
+            Configure if the `TransformerBlock` should use learnable elementwise affine parameters for normalization.
+        double_self_attention (`bool`, *optional*):
+            Configure if each `TransformerBlock` should contain two self-attention layers.
+        positional_embeddings: (`str`, *optional*):
+            The type of positional embeddings to apply to the sequence input before passing use.
+        num_positional_embeddings: (`int`, *optional*):
+            The maximum length of the sequence over which to apply positional embeddings.
    """

+    @register_to_config
    def __init__(
        self,
        num_attention_heads: int = 16,
@@ -221,7 +237,12 @@ class TransformerSpatioTemporalModel(nn.Module):
        in_channels: int = 320,
        out_channels: Optional[int] = None,
        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
        cross_attention_dim: Optional[int] = None,
+        norm_eps: float = 1e-5,
+        merge_factor: float = 0.5,
+        merge_strategy: str = "learned_with_images",
    ):
        super().__init__()
        self.num_attention_heads = num_attention_heads
@@ -230,10 +251,12 @@ class TransformerSpatioTemporalModel(nn.Module):
        inner_dim = num_attention_heads * attention_head_dim
        self.inner_dim = inner_dim

+        linear_cls = nn.Linear
+
        # 2. Define input layers
        self.in_channels = in_channels
-        self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6)
-        self.proj_in = nn.Linear(in_channels, inner_dim)
+        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=norm_eps)
+        self.proj_in = linear_cls(in_channels, inner_dim)

        # 3. Define transformers blocks
        self.transformer_blocks = nn.ModuleList(
@@ -242,7 +265,9 @@ class TransformerSpatioTemporalModel(nn.Module):
                    inner_dim,
                    num_attention_heads,
                    attention_head_dim,
+                    dropout=dropout,
                    cross_attention_dim=cross_attention_dim,
+                    norm_eps=norm_eps,
                )
                for d in range(num_layers)
            ]
@@ -256,7 +281,9 @@ class TransformerSpatioTemporalModel(nn.Module):
                    time_mix_inner_dim,
                    num_attention_heads,
                    attention_head_dim,
+                    dropout=dropout,
                    cross_attention_dim=cross_attention_dim,
+                    norm_eps=norm_eps,
                )
                for _ in range(num_layers)
            ]
@@ -265,36 +292,48 @@ class TransformerSpatioTemporalModel(nn.Module):
        time_embed_dim = in_channels * 4
        self.time_pos_embed = TimestepEmbedding(in_channels, time_embed_dim, out_dim=in_channels)
        self.time_proj = Timesteps(in_channels, True, 0)
-        self.time_mixer = AlphaBlender(alpha=0.5, merge_strategy="learned_with_images")
+        self.time_mixer = AlphaBlender(alpha=merge_factor, merge_strategy=merge_strategy)

        # 4. Define output layers
        self.out_channels = in_channels if out_channels is None else out_channels
        # TODO: should use out_channels for continuous projections
-        self.proj_out = nn.Linear(inner_dim, in_channels)
+        self.proj_out = linear_cls(inner_dim, in_channels)

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
+        num_frames: int,
        encoder_hidden_states: Optional[torch.Tensor] = None,
+        timestep: Optional[torch.LongTensor] = None,
+        class_labels: Optional[torch.LongTensor] = None,
+        cross_attention_kwargs: Dict[str, Any] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
        image_only_indicator: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ):
        """
        Args:
-            hidden_states (`torch.FloatTensor` of shape `(batch size, channel, height, width)`):
+            hidden_states (`torch.LongTensor` of shape `(batch size, num latent pixels)` if discrete, `torch.FloatTensor` of shape `(batch size, channel, height, width)` if continuous):
                Input hidden_states.
-            num_frames (`int`):
-                The number of frames to be processed per batch. This is used to reshape the hidden states.
            encoder_hidden_states ( `torch.LongTensor` of shape `(batch size, encoder_hidden_states dim)`, *optional*):
                Conditional embeddings for cross attention layer. If not given, cross-attention defaults to
                self-attention.
-            image_only_indicator (`torch.LongTensor` of shape `(batch size, num_frames)`, *optional*):
-                A tensor indicating whether the input contains only images. 1 indicates that the input contains only
-                images, 0 indicates that the input contains video frames.
+            timestep ( `torch.LongTensor`, *optional*):
+                Used to indicate denoising step. Optional timestep to be applied as an embedding in `AdaLayerNorm`.
+            class_labels ( `torch.LongTensor` of shape `(batch size, num classes)`, *optional*):
+                Used to indicate class labels conditioning. Optional class labels to be applied as an embedding in
+                `AdaLayerZeroNorm`.
+            num_frames (`int`):
+                The number of frames to be processed per batch. This is used to reshape the hidden states.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] instead of a plain
+                Whether or not to return a [`~models.transformer_temporal.TransformerTemporalModelOutput`] instead of a plain
                tuple.

        Returns:
@@ -302,19 +341,17 @@ class TransformerSpatioTemporalModel(nn.Module):
                If `return_dict` is True, an [`~models.transformer_temporal.TransformerTemporalModelOutput`] is
                returned, otherwise a `tuple` where the first element is the sample tensor.
        """
+        assert (
+            encoder_hidden_states.ndim == 3
+        ), f"n dims of spatial context should be 3 but are {encoder_hidden_states.ndim}"
+
        # 1. Input
-        batch_frames, _, height, width = hidden_states.shape
-        num_frames = image_only_indicator.shape[-1]
+        batch_frames, channel, height, width = hidden_states.shape
        batch_size = batch_frames // num_frames

        time_context = encoder_hidden_states
-        time_context_first_timestep = time_context[None, :].reshape(
-            batch_size, num_frames, -1, time_context.shape[-1]
-        )[:, 0]
-        time_context = time_context_first_timestep[None, :].broadcast_to(
-            height * width, batch_size, 1, time_context.shape[-1]
-        )
-        time_context = time_context.reshape(height * width * batch_size, 1, time_context.shape[-1])
+        time_context_first_timestep = time_context[::num_frames]
+        time_context = time_context_first_timestep.repeat(height * width, 1, 1)

        residual = hidden_states

@@ -345,12 +382,20 @@ class TransformerSpatioTemporalModel(nn.Module):
                    None,
                    encoder_hidden_states,
                    None,
+                    timestep,
+                    cross_attention_kwargs,
+                    class_labels,
                    use_reentrant=False,
                )
            else:
                hidden_states = block(
                    hidden_states,
+                    attention_mask=None,
                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=None,
+                    timestep=timestep,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    class_labels=class_labels,
                )

            hidden_states_mix = hidden_states
@@ -360,6 +405,7 @@ class TransformerSpatioTemporalModel(nn.Module):
                hidden_states_mix,
                num_frames=num_frames,
                encoder_hidden_states=time_context,
+                cross_attention_kwargs=cross_attention_kwargs,
            )
            hidden_states = self.time_mixer(
                x_spatial=hidden_states,
@@ -1,28 +1,16 @@
-# Copyright 2023 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
+import math
 from dataclasses import dataclass
 from typing import Dict, Tuple, Union

 import torch
+import torch.nn.functional as F
 import torch.utils.checkpoint
 from torch import nn

 from ..configuration_utils import ConfigMixin, register_to_config
 from ..utils import BaseOutput, logging
-from .attention_processor import Attention, AttentionProcessor, AttnProcessor
-from .embeddings import TimestepEmbedding, Timesteps
+from .attention_processor import AttentionProcessor, Kandi3AttnProcessor
+from .embeddings import TimestepEmbedding
 from .modeling_utils import ModelMixin


@@ -34,6 +22,36 @@ class Kandinsky3UNetOutput(BaseOutput):
    sample: torch.FloatTensor = None


+# TODO(Yiyi): This function needs to be removed
+def set_default_item(condition, item_1, item_2=None):
+    """
+    Return `item_1` when `condition` is truthy, otherwise `item_2` (`None` by default).
+    """
+    return item_1 if condition else item_2
+
+
+# TODO(Yiyi): This function needs to be removed
+def set_default_layer(
+    condition, layer_1, args_1=(), kwargs_1=None, layer_2=torch.nn.Identity, args_2=(), kwargs_2=None
+):
+    """
+    Instantiate and return `layer_1` when `condition` is truthy, otherwise `layer_2`.
+
+    Args:
+        condition: Selects which layer class gets instantiated.
+        layer_1: Callable built as `layer_1(*args_1, **kwargs_1)` when `condition` is truthy.
+        args_1: Positional arguments for `layer_1`.
+        kwargs_1: Keyword arguments for `layer_1`; `None` means no keyword arguments.
+        layer_2: Fallback callable (defaults to `torch.nn.Identity`), built as `layer_2(*args_2, **kwargs_2)`.
+        args_2: Positional arguments for `layer_2`.
+        kwargs_2: Keyword arguments for `layer_2`; `None` means no keyword arguments.
+
+    Returns:
+        The newly constructed layer.
+    """
+    # Defaults are immutable (`()` / `None`) so no list or dict object is shared between calls.
+    if condition:
+        return layer_1(*args_1, **(kwargs_1 or {}))
+    return layer_2(*args_2, **(kwargs_2 or {}))
+
+
+# TODO(Yiyi): This class should be removed and be replaced by Timesteps
+class SinusoidalPosEmb(nn.Module):
+    """
+    Sinusoidal embedding: concatenates the sine and cosine of the input scaled by `dim // 2` geometrically
+    spaced frequencies, giving a last dimension of size `2 * (dim // 2)`.
+    """
+
+    def __init__(self, dim):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, x, type_tensor=None):
+        # NOTE(review): `x` looks like a 1-D tensor of timesteps (it is indexed as `x[:, None]`) - confirm.
+        # `type_tensor` is accepted but never read.
+        num_freqs = self.dim // 2
+        log_step = math.log(10000) / (num_freqs - 1)
+        freqs = torch.exp(torch.arange(num_freqs, device=x.device) * -log_step)
+        angles = x[:, None] * freqs[None, :]
+        return torch.cat((angles.sin(), angles.cos()), dim=-1)
+
+
 class Kandinsky3EncoderProj(nn.Module):
    def __init__(self, encoder_hid_dim, cross_attention_dim):
        super().__init__()
@@ -69,7 +87,9 @@ class Kandinsky3UNet(ModelMixin, ConfigMixin):

        out_channels = in_channels
        init_channels = block_out_channels[0] // 2
-        self.time_proj = Timesteps(init_channels, flip_sin_to_cos=False, downscale_freq_shift=1)
+        # TODO(Yiyi): Should be replaced with Timesteps class -> make sure that results are the same
+        # self.time_proj = Timesteps(init_channels, flip_sin_to_cos=False, downscale_freq_shift=1)
+        self.time_proj = SinusoidalPosEmb(init_channels)

        self.time_embedding = TimestepEmbedding(
            init_channels,
@@ -86,7 +106,7 @@ class Kandinsky3UNet(ModelMixin, ConfigMixin):

        hidden_dims = [init_channels] + list(block_out_channels)
        in_out_dims = list(zip(hidden_dims[:-1], hidden_dims[1:]))
-        text_dims = [cross_attention_dim if is_exist else None for is_exist in add_cross_attention]
+        text_dims = [set_default_item(is_exist, cross_attention_dim) for is_exist in add_cross_attention]
        num_blocks = len(block_out_channels) * [layers_per_block]
        layer_params = [num_blocks, text_dims, add_self_attention]
        rev_layer_params = map(reversed, layer_params)
@@ -98,7 +118,7 @@ class Kandinsky3UNet(ModelMixin, ConfigMixin):
            zip(in_out_dims, *layer_params)
        ):
            down_sample = level != (self.num_levels - 1)
-            cat_dims.append(out_dim if level != (self.num_levels - 1) else 0)
+            cat_dims.append(set_default_item(level != (self.num_levels - 1), out_dim, 0))
            self.down_blocks.append(
                Kandinsky3DownSampleBlock(
                    in_dim,
@@ -203,16 +223,18 @@ class Kandinsky3UNet(ModelMixin, ConfigMixin):
        """
        Disables custom attention processors and sets the default attention implementation.
        """
-        self.set_attn_processor(AttnProcessor())
+        self.set_attn_processor(Kandi3AttnProcessor())

    def _set_gradient_checkpointing(self, module, value=False):
        if hasattr(module, "gradient_checkpointing"):
            module.gradient_checkpointing = value

    def forward(self, sample, timestep, encoder_hidden_states=None, encoder_attention_mask=None, return_dict=True):
-        if encoder_attention_mask is not None:
-            encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0
-            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+        # TODO(Yiyi): Clean up the following variables - these names should not be used
+        # but instead only the ones that we pass to forward
+        x = sample
+        context_mask = encoder_attention_mask
+        context = encoder_hidden_states

        if not torch.is_tensor(timestep):
            dtype = torch.float32 if isinstance(timestep, float) else torch.int32
@@ -222,33 +244,33 @@ class Kandinsky3UNet(ModelMixin, ConfigMixin):

        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
        timestep = timestep.expand(sample.shape[0])
-        time_embed_input = self.time_proj(timestep).to(sample.dtype)
+        time_embed_input = self.time_proj(timestep).to(x.dtype)
        time_embed = self.time_embedding(time_embed_input)

-        encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
+        context = self.encoder_hid_proj(context)

-        if encoder_hidden_states is not None:
-            time_embed = self.add_time_condition(time_embed, encoder_hidden_states, encoder_attention_mask)
+        if context is not None:
+            time_embed = self.add_time_condition(time_embed, context, context_mask)

        hidden_states = []
-        sample = self.conv_in(sample)
+        x = self.conv_in(x)
        for level, down_sample in enumerate(self.down_blocks):
-            sample = down_sample(sample, time_embed, encoder_hidden_states, encoder_attention_mask)
+            x = down_sample(x, time_embed, context, context_mask)
            if level != self.num_levels - 1:
-                hidden_states.append(sample)
+                hidden_states.append(x)

        for level, up_sample in enumerate(self.up_blocks):
            if level != 0:
-                sample = torch.cat([sample, hidden_states.pop()], dim=1)
-            sample = up_sample(sample, time_embed, encoder_hidden_states, encoder_attention_mask)
+                x = torch.cat([x, hidden_states.pop()], dim=1)
+            x = up_sample(x, time_embed, context, context_mask)

-        sample = self.conv_norm_out(sample)
-        sample = self.conv_act_out(sample)
-        sample = self.conv_out(sample)
+        x = self.conv_norm_out(x)
+        x = self.conv_act_out(x)
+        x = self.conv_out(x)

        if not return_dict:
-            return (sample,)
-        return Kandinsky3UNetOutput(sample=sample)
+            return (x,)
+        return Kandinsky3UNetOutput(sample=x)


 class Kandinsky3UpSampleBlock(nn.Module):
@@ -268,7 +290,7 @@ class Kandinsky3UpSampleBlock(nn.Module):
        self_attention=True,
    ):
        super().__init__()
-        up_resolutions = [[None, True if up_sample else None, None, None]] + [[None] * 4] * (num_blocks - 1)
+        up_resolutions = [[None, set_default_item(up_sample, True), None, None]] + [[None] * 4] * (num_blocks - 1)
        hidden_channels = (
            [(in_channels + cat_dim, in_channels)]
            + [(in_channels, in_channels)] * (num_blocks - 2)
@@ -281,27 +303,27 @@ class Kandinsky3UpSampleBlock(nn.Module):
        self.self_attention = self_attention
        self.context_dim = context_dim

-        if self_attention:
-            attentions.append(
-                Kandinsky3AttentionBlock(out_channels, time_embed_dim, None, groups, head_dim, expansion_ratio)
+        attentions.append(
+            set_default_layer(
+                self_attention,
+                Kandinsky3AttentionBlock,
+                (out_channels, time_embed_dim, None, groups, head_dim, expansion_ratio),
+                layer_2=nn.Identity,
            )
-        else:
-            attentions.append(nn.Identity())
+        )

        for (in_channel, out_channel), up_resolution in zip(hidden_channels, up_resolutions):
            resnets_in.append(
                Kandinsky3ResNetBlock(in_channel, in_channel, time_embed_dim, groups, compression_ratio, up_resolution)
            )
-
-            if context_dim is not None:
-                attentions.append(
-                    Kandinsky3AttentionBlock(
-                        in_channel, time_embed_dim, context_dim, groups, head_dim, expansion_ratio
-                    )
+            attentions.append(
+                set_default_layer(
+                    context_dim is not None,
+                    Kandinsky3AttentionBlock,
+                    (in_channel, time_embed_dim, context_dim, groups, head_dim, expansion_ratio),
+                    layer_2=nn.Identity,
                )
-            else:
-                attentions.append(nn.Identity())
-
+            )
            resnets_out.append(
                Kandinsky3ResNetBlock(in_channel, out_channel, time_embed_dim, groups, compression_ratio)
            )
@@ -345,29 +367,29 @@ class Kandinsky3DownSampleBlock(nn.Module):
        self.self_attention = self_attention
        self.context_dim = context_dim

-        if self_attention:
-            attentions.append(
-                Kandinsky3AttentionBlock(in_channels, time_embed_dim, None, groups, head_dim, expansion_ratio)
+        attentions.append(
+            set_default_layer(
+                self_attention,
+                Kandinsky3AttentionBlock,
+                (in_channels, time_embed_dim, None, groups, head_dim, expansion_ratio),
+                layer_2=nn.Identity,
            )
-        else:
-            attentions.append(nn.Identity())
+        )

-        up_resolutions = [[None] * 4] * (num_blocks - 1) + [[None, None, False if down_sample else None, None]]
+        up_resolutions = [[None] * 4] * (num_blocks - 1) + [[None, None, set_default_item(down_sample, False), None]]
        hidden_channels = [(in_channels, out_channels)] + [(out_channels, out_channels)] * (num_blocks - 1)
        for (in_channel, out_channel), up_resolution in zip(hidden_channels, up_resolutions):
            resnets_in.append(
                Kandinsky3ResNetBlock(in_channel, out_channel, time_embed_dim, groups, compression_ratio)
            )
-
-            if context_dim is not None:
-                attentions.append(
-                    Kandinsky3AttentionBlock(
-                        out_channel, time_embed_dim, context_dim, groups, head_dim, expansion_ratio
-                    )
+            attentions.append(
+                set_default_layer(
+                    context_dim is not None,
+                    Kandinsky3AttentionBlock,
+                    (out_channel, time_embed_dim, context_dim, groups, head_dim, expansion_ratio),
+                    layer_2=nn.Identity,
                )
-            else:
-                attentions.append(nn.Identity())
-
+            )
            resnets_out.append(
                Kandinsky3ResNetBlock(
                    out_channel, out_channel, time_embed_dim, groups, compression_ratio, up_resolution
@@ -409,23 +431,68 @@ class Kandinsky3ConditionalGroupNorm(nn.Module):
        return x


+# TODO(Yiyi): This class should ideally not even exist, it slows everything needlessly down. I'm pretty
+# sure we can delete it and instead just pass an attention_mask
+class Attention(nn.Module):
+    """
+    Attention layer that owns bias-free query/key/value/output linear projections and delegates the actual
+    computation to `self.processor` (a `Kandi3AttnProcessor` by default).
+
+    Args:
+        in_channels: Input feature size of the query projection `to_q`.
+        out_channels: Output feature size of every projection; must be divisible by `head_dim`.
+        context_dim: Input feature size of the key and value projections `to_k` / `to_v`.
+        head_dim: Size of a single head; the head count is `out_channels // head_dim`.
+    """
+
+    def __init__(self, in_channels, out_channels, context_dim, head_dim=64):
+        super().__init__()
+        assert out_channels % head_dim == 0
+        self.num_heads = out_channels // head_dim
+        self.scale = head_dim**-0.5
+
+        # to_q
+        self.to_q = nn.Linear(in_channels, out_channels, bias=False)
+        # to_k
+        self.to_k = nn.Linear(context_dim, out_channels, bias=False)
+        # to_v
+        self.to_v = nn.Linear(context_dim, out_channels, bias=False)
+        processor = Kandi3AttnProcessor()
+        self.set_processor(processor)
+        # to_out
+        # kept as a one-element ModuleList, so the projection is addressed as `to_out[0]`
+        self.to_out = nn.ModuleList([])
+        self.to_out.append(nn.Linear(out_channels, out_channels, bias=False))
+
+    def set_processor(self, processor: "AttnProcessor"):  # noqa: F821
+        """
+        Replace the attention processor used by `forward`.
+
+        Args:
+            processor: The new processor; it is called as
+                `processor(attn, x, context=..., context_mask=...)` from `forward`.
+        """
+        # if current processor is in `self._modules` and if passed `processor` is not, we need to
+        # pop `processor` from `self._modules`
+        if (
+            hasattr(self, "processor")
+            and isinstance(self.processor, torch.nn.Module)
+            and not isinstance(processor, torch.nn.Module)
+        ):
+            # NOTE(review): `logger` is not defined in the visible part of this file - presumably a
+            # module-level logger; confirm.
+            logger.info(f"You are removing possibly trained weights of {self.processor} with {processor}")
+            self._modules.pop("processor")
+
+        self.processor = processor
+
+    def forward(self, x, context, context_mask=None, image_mask=None):
+        """
+        Run `self.processor` on `x` with `context` and `context_mask`.
+
+        NOTE(review): `image_mask` is accepted but is not forwarded to the processor, so it currently has
+        no effect here.
+        """
+        return self.processor(
+            self,
+            x,
+            context=context,
+            context_mask=context_mask,
+        )
+
+
 class Kandinsky3Block(nn.Module):
    def __init__(self, in_channels, out_channels, time_embed_dim, kernel_size=3, norm_groups=32, up_resolution=None):
        super().__init__()
        self.group_norm = Kandinsky3ConditionalGroupNorm(norm_groups, in_channels, time_embed_dim)
        self.activation = nn.SiLU()
-        if up_resolution is not None and up_resolution:
-            self.up_sample = nn.ConvTranspose2d(in_channels, in_channels, kernel_size=2, stride=2)
-        else:
-            self.up_sample = nn.Identity()
-
+        self.up_sample = set_default_layer(
+            up_resolution is not None and up_resolution,
+            nn.ConvTranspose2d,
+            (in_channels, in_channels),
+            {"kernel_size": 2, "stride": 2},
+        )
        padding = int(kernel_size > 1)
        self.projection = nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size, padding=padding)
-
-        if up_resolution is not None and not up_resolution:
-            self.down_sample = nn.Conv2d(out_channels, out_channels, kernel_size=2, stride=2)
-        else:
-            self.down_sample = nn.Identity()
+        self.down_sample = set_default_layer(
+            up_resolution is not None and not up_resolution,
+            nn.Conv2d,
+            (out_channels, out_channels),
+            {"kernel_size": 2, "stride": 2},
+        )

    def forward(self, x, time_embed):
        x = self.group_norm(x, time_embed)
@@ -454,18 +521,14 @@ class Kandinsky3ResNetBlock(nn.Module):
                )
            ]
        )
-        self.shortcut_up_sample = (
-            nn.ConvTranspose2d(in_channels, in_channels, kernel_size=2, stride=2)
-            if True in up_resolutions
-            else nn.Identity()
+        self.shortcut_up_sample = set_default_layer(
+            True in up_resolutions, nn.ConvTranspose2d, (in_channels, in_channels), {"kernel_size": 2, "stride": 2}
        )
-        self.shortcut_projection = (
-            nn.Conv2d(in_channels, out_channels, kernel_size=1) if in_channels != out_channels else nn.Identity()
+        self.shortcut_projection = set_default_layer(
+            in_channels != out_channels, nn.Conv2d, (in_channels, out_channels), {"kernel_size": 1}
        )
-        self.shortcut_down_sample = (
-            nn.Conv2d(out_channels, out_channels, kernel_size=2, stride=2)
-            if False in up_resolutions
-            else nn.Identity()
+        self.shortcut_down_sample = set_default_layer(
+            False in up_resolutions, nn.Conv2d, (out_channels, out_channels), {"kernel_size": 2, "stride": 2}
        )

    def forward(self, x, time_embed):
@@ -483,16 +546,9 @@ class Kandinsky3ResNetBlock(nn.Module):
 class Kandinsky3AttentionPooling(nn.Module):
    def __init__(self, num_channels, context_dim, head_dim=64):
        super().__init__()
-        self.attention = Attention(
-            context_dim,
-            context_dim,
-            dim_head=head_dim,
-            out_dim=num_channels,
-            out_bias=False,
-        )
+        self.attention = Attention(context_dim, num_channels, context_dim, head_dim)

    def forward(self, x, context, context_mask=None):
-        context_mask = context_mask.to(dtype=context.dtype)
        context = self.attention(context.mean(dim=1, keepdim=True), context, context_mask)
        return x + context.squeeze(1)

@@ -501,13 +557,7 @@ class Kandinsky3AttentionBlock(nn.Module):
    def __init__(self, num_channels, time_embed_dim, context_dim=None, norm_groups=32, head_dim=64, expansion_ratio=4):
        super().__init__()
        self.in_norm = Kandinsky3ConditionalGroupNorm(norm_groups, num_channels, time_embed_dim)
-        self.attention = Attention(
-            num_channels,
-            context_dim or num_channels,
-            dim_head=head_dim,
-            out_dim=num_channels,
-            out_bias=False,
-        )
+        self.attention = Attention(num_channels, num_channels, context_dim or num_channels, head_dim)

        hidden_channels = expansion_ratio * num_channels
        self.out_norm = Kandinsky3ConditionalGroupNorm(norm_groups, num_channels, time_embed_dim)
@@ -522,10 +572,14 @@ class Kandinsky3AttentionBlock(nn.Module):
        out = self.in_norm(x, time_embed)
        out = out.reshape(x.shape[0], -1, height * width).permute(0, 2, 1)
        context = context if context is not None else out
-        if context_mask is not None:
-            context_mask = context_mask.to(dtype=context.dtype)

-        out = self.attention(out, context, context_mask)
+        if image_mask is not None:
+            mask_height, mask_width = image_mask.shape[-2:]
+            kernel_size = (mask_height // height, mask_width // width)
+            image_mask = F.max_pool2d(image_mask, kernel_size, kernel_size)
+            image_mask = image_mask.reshape(image_mask.shape[0], -1)
+
+        out = self.attention(out, context, context_mask, image_mask)
        out = out.permute(0, 2, 1).unsqueeze(-1).reshape(out.shape[0], -1, height, width)
        x = x + out

@@ -1,13 +1,26 @@
 from dataclasses import dataclass
-from typing import Dict, Optional, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union

 import torch
 import torch.nn as nn

 from ..configuration_utils import ConfigMixin, register_to_config
 from ..loaders import UNet2DConditionLoadersMixin
-from ..utils import BaseOutput, logging
-from .attention_processor import CROSS_ATTENTION_PROCESSORS, AttentionProcessor, AttnProcessor
+from ..utils import (
+    USE_PEFT_BACKEND,
+    BaseOutput,
+    logging,
+    scale_lora_layers,
+    unscale_lora_layers,
+)
+from .activations import get_activation
+from .attention_processor import (
+    ADDED_KV_ATTENTION_PROCESSORS,
+    CROSS_ATTENTION_PROCESSORS,
+    AttentionProcessor,
+    AttnAddedKVProcessor,
+    AttnProcessor,
+)
 from .embeddings import TimestepEmbedding, Timesteps
 from .modeling_utils import ModelMixin
 from .unet_3d_blocks import UNetMidBlockSpatioTemporal, get_down_block, get_up_block
@@ -22,16 +35,18 @@ class UNetSpatioTemporalConditionOutput(BaseOutput):
    The output of [`UNetSpatioTemporalConditionModel`].

    Args:
-        sample (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+        sample (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            The hidden states output conditioned on `encoder_hidden_states` input. Output of last layer of model.
    """

    sample: torch.FloatTensor = None


-class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
+class UNetSpatioTemporalConditionModel(
+    ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin
+):
    r"""
-    A conditional Spatio-Temporal UNet model that takes a noisy video frames, conditional state, and a timestep and returns a sample
+    A conditional 2D UNet model that takes a noisy sample, conditional state, and a timestep and returns a sample
    shaped output.

    This model inherits from [`ModelMixin`]. Check the superclass documentation for its generic methods implemented
@@ -40,28 +55,87 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
    Parameters:
        sample_size (`int` or `Tuple[int, int]`, *optional*, defaults to `None`):
            Height and width of input/output sample.
-        in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample.
+        in_channels (`int`, *optional*, defaults to 8): Number of channels in the input sample.
        out_channels (`int`, *optional*, defaults to 4): Number of channels in the output.
-        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "CrossAttnDownBlockSpatioTemporal", "DownBlockSpatioTemporal")`):
+        center_input_sample (`bool`, *optional*, defaults to `False`): Whether to center the input sample.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
            The tuple of downsample blocks to use.
-        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal", "CrossAttnUpBlockSpatioTemporal")`):
+        mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`):
+            Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or
+            `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped.
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`):
            The tuple of upsample blocks to use.
+        only_cross_attention (`bool` or `Tuple[bool]`, *optional*, defaults to `False`):
+            Whether to include self-attention in the basic transformer blocks, see
+            [`~models.attention.BasicTransformerBlock`].
        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
            The tuple of output channels for each block.
-        addition_time_embed_dim: (`int`, defaults to 256):
-            Dimension to to encode the additional time ids.
-        projection_class_embeddings_input_dim (`int`, defaults to 768):
-            The dimension of the projection of encoded `added_time_ids`.
        layers_per_block (`int`, *optional*, defaults to 2): The number of layers per block.
+        downsample_padding (`int`, *optional*, defaults to 1): The padding to use for the downsampling convolution.
+        mid_block_scale_factor (`float`, *optional*, defaults to 1.0): The scale factor to use for the mid block.
+        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+        act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use.
+        norm_num_groups (`int`, *optional*, defaults to 32): The number of groups to use for the normalization.
+            If `None`, normalization and activation layers are skipped in post-processing.
+        norm_eps (`float`, *optional*, defaults to 1e-5): The epsilon to use for the normalization.
        cross_attention_dim (`int` or `Tuple[int]`, *optional*, defaults to 1024):
            The dimension of the cross attention features.
        transformer_layers_per_block (`int`, `Tuple[int]`, or `Tuple[Tuple]` , *optional*, defaults to 1):
            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for
-            [`~models.unet_3d_blocks.CrossAttnDownBlockSpatioTemporal`], [`~models.unet_3d_blocks.CrossAttnUpBlockSpatioTemporal`],
-            [`~models.unet_3d_blocks.UNetMidBlockSpatioTemporal`].
-        num_attention_heads (`int`, `Tuple[int]`, defaults to `(5, 10, 10, 20)`):
-            The number of attention heads.
-        dropout (`float`, *optional*, defaults to 0.0): The dropout probability to use.
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        reverse_transformer_layers_per_block (`Tuple[Tuple]`, *optional*, defaults to `None`):
+            The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`], in the upsampling
+            blocks of the U-Net. Only relevant if `transformer_layers_per_block` is of type `Tuple[Tuple]` and for
+            [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`],
+            [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`].
+        encoder_hid_dim (`int`, *optional*, defaults to None):
+            If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim`
+            dimension to `cross_attention_dim`.
+        encoder_hid_dim_type (`str`, *optional*, defaults to `None`):
+            If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text
+            embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`.
+        attention_head_dim (`int`, *optional*, defaults to 8): The dimension of the attention heads.
+        num_attention_heads (`int`, *optional*):
+            The number of attention heads. If not defined, defaults to `attention_head_dim`
+        resnet_time_scale_shift (`str`, *optional*, defaults to `"default"`): Time scale shift config
+            for ResNet blocks (see [`~models.resnet.ResnetBlock2D`]). Choose from `default` or `scale_shift`.
+        class_embed_type (`str`, *optional*, defaults to `None`):
+            The type of class embedding to use which is ultimately summed with the time embeddings. Choose from `None`,
+            `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`.
+        addition_embed_type (`str`, *optional*, defaults to `None`):
+            Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or
+            "text". "text" will use the `TextTimeEmbedding` layer.
+        addition_time_embed_dim: (`int`, *optional*, defaults to `None`):
+            Dimension for the timestep embeddings.
+        num_class_embeds (`int`, *optional*, defaults to `None`):
+            Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing
+            class conditioning with `class_embed_type` equal to `None`.
+        time_embedding_type (`str`, *optional*, defaults to `positional`):
+            The type of position embedding to use for timesteps. Choose from `positional` or `fourier`.
+        time_embedding_dim (`int`, *optional*, defaults to `None`):
+            An optional override for the dimension of the projected time embedding.
+        time_embedding_act_fn (`str`, *optional*, defaults to `None`):
+            Optional activation function to use only once on the time embeddings before they are passed to the rest of
+            the UNet. Choose from `silu`, `mish`, `gelu`, and `swish`.
+        timestep_post_act (`str`, *optional*, defaults to `None`):
+            The second activation function to use in timestep embedding. Choose from `silu`, `mish` and `gelu`.
+        time_cond_proj_dim (`int`, *optional*, defaults to `None`):
+            The dimension of `cond_proj` layer in the timestep embedding.
+        conv_in_kernel (`int`, *optional*, defaults to `3`): The kernel size of `conv_in` layer.
+        conv_out_kernel (`int`, *optional*, defaults to `3`): The kernel size of `conv_out` layer.
+        projection_class_embeddings_input_dim (`int`, *optional*): The dimension of the `class_labels` input when
+            `class_embed_type="projection"`. Required when `class_embed_type="projection"`.
+        class_embeddings_concat (`bool`, *optional*, defaults to `False`): Whether to concatenate the time
+            embeddings with the class embeddings.
+        mid_block_only_cross_attention (`bool`, *optional*, defaults to `None`):
+            Whether to use cross attention with the mid block when using the `UNetMidBlock2DSimpleCrossAttn`. If
+            `only_cross_attention` is given as a single boolean and `mid_block_only_cross_attention` is `None`, the
+            `only_cross_attention` value is used as the value for `mid_block_only_cross_attention`. Default to `False`
+            otherwise.
    """

    _supports_gradient_checkpointing = True
@@ -72,12 +146,15 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
        sample_size: Optional[int] = None,
        in_channels: int = 8,
        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
        down_block_types: Tuple[str] = (
            "CrossAttnDownBlockSpatioTemporal",
            "CrossAttnDownBlockSpatioTemporal",
            "CrossAttnDownBlockSpatioTemporal",
            "DownBlockSpatioTemporal",
        ),
+        mid_block_type: Optional[str] = "UNetMidBlockSpatioTemporal",
        up_block_types: Tuple[str] = (
            "UpBlockSpatioTemporal",
            "CrossAttnUpBlockSpatioTemporal",
@@ -85,18 +162,42 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
            "CrossAttnUpBlockSpatioTemporal",
        ),
        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
-        addition_time_embed_dim: int = 256,
        projection_class_embeddings_input_dim: int = 768,
+        addition_time_embed_dim: int = 256,
        layers_per_block: Union[int, Tuple[int]] = 2,
+        mid_block_scale_factor: float = 1,
+        dropout: float = 0.0,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
        cross_attention_dim: Union[int, Tuple[int]] = 1024,
        transformer_layers_per_block: Union[int, Tuple[int], Tuple[Tuple]] = 1,
-        num_attention_heads: Union[int, Tuple[int]] = (5, 10, 10, 20),
-        num_frames: int = 25,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
+        time_embedding_dim: Optional[int] = None,
+        conv_in_kernel: int = 3,
+        conv_out_kernel: int = 3,
+        kernel_size_3d: Optional[torch.FloatTensor] = (3, 1, 1),
+        merge_factor: float = 0.5,
+        merge_strategy: str = "learned_with_images",
    ):
        super().__init__()

        self.sample_size = sample_size

+        if num_attention_heads is not None:
+            raise ValueError(
+                "At the moment it is not possible to define the number of attention heads via `num_attention_heads` because of a naming issue as described in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131. Passing `num_attention_heads` will only be supported in diffusers v0.19."
+            )
+
+        # If `num_attention_heads` is not defined (which is the case for most models)
+        # it will default to `attention_head_dim`. This looks weird upon first reading it and it is.
+        # The reason for this behavior is to correct for incorrectly named variables that were introduced
+        # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131
+        # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking
+        # which is why we correct for the naming here.
+        num_attention_heads = num_attention_heads or attention_head_dim
+
        # Check inputs
        if len(down_block_types) != len(up_block_types):
            raise ValueError(
@@ -108,39 +209,63 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
                f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}."
            )

-        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types):
+        if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(
+            down_block_types
+        ):
            raise ValueError(
                f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}."
            )

-        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(down_block_types):
+        if not isinstance(attention_head_dim, int) and len(attention_head_dim) != len(
+            down_block_types
+        ):
+            raise ValueError(
+                f"Must provide the same number of `attention_head_dim` as `down_block_types`. `attention_head_dim`: {attention_head_dim}. `down_block_types`: {down_block_types}."
+            )
+
+        if isinstance(cross_attention_dim, list) and len(cross_attention_dim) != len(
+            down_block_types
+        ):
            raise ValueError(
                f"Must provide the same number of `cross_attention_dim` as `down_block_types`. `cross_attention_dim`: {cross_attention_dim}. `down_block_types`: {down_block_types}."
            )

-        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(down_block_types):
+        if not isinstance(layers_per_block, int) and len(layers_per_block) != len(
+            down_block_types
+        ):
            raise ValueError(
                f"Must provide the same number of `layers_per_block` as `down_block_types`. `layers_per_block`: {layers_per_block}. `down_block_types`: {down_block_types}."
            )

        # input
+        conv_in_padding = (conv_in_kernel - 1) // 2
        self.conv_in = nn.Conv2d(
            in_channels,
            block_out_channels[0],
-            kernel_size=3,
-            padding=1,
+            kernel_size=conv_in_kernel,
+            padding=conv_in_padding,
        )

        # time
-        time_embed_dim = block_out_channels[0] * 4
+        time_embed_dim = time_embedding_dim or block_out_channels[0] * 4

-        self.time_proj = Timesteps(block_out_channels[0], True, downscale_freq_shift=0)
+        self.time_proj = Timesteps(
+            block_out_channels[0], flip_sin_to_cos, downscale_freq_shift=0
+        )
        timestep_input_dim = block_out_channels[0]

-        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+        self.time_embedding = TimestepEmbedding(
+            timestep_input_dim,
+            time_embed_dim,
+            act_fn=act_fn,
+        )

-        self.add_time_proj = Timesteps(addition_time_embed_dim, True, downscale_freq_shift=0)
-        self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim)
+        self.add_time_proj = Timesteps(
+            addition_time_embed_dim, flip_sin_to_cos, downscale_freq_shift=0
+        )
+        self.add_embedding = TimestepEmbedding(
+            projection_class_embeddings_input_dim, time_embed_dim
+        )

        self.down_blocks = nn.ModuleList([])
        self.up_blocks = nn.ModuleList([])
@@ -148,6 +273,9 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
        if isinstance(num_attention_heads, int):
            num_attention_heads = (num_attention_heads,) * len(down_block_types)

+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+
        if isinstance(cross_attention_dim, int):
            cross_attention_dim = (cross_attention_dim,) * len(down_block_types)

@@ -155,7 +283,9 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
            layers_per_block = [layers_per_block] * len(down_block_types)

        if isinstance(transformer_layers_per_block, int):
-            transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types)
+            transformer_layers_per_block = [transformer_layers_per_block] * len(
+                down_block_types
+            )

        blocks_time_embed_dim = time_embed_dim

@@ -174,10 +304,16 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
                out_channels=output_channel,
                temb_channels=blocks_time_embed_dim,
                add_downsample=not is_final_block,
-                resnet_eps=1e-5,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
                cross_attention_dim=cross_attention_dim[i],
                num_attention_heads=num_attention_heads[i],
-                resnet_act_fn="silu",
+                downsample_padding=1,
+                dropout=dropout,
+                kernel_size_3d=kernel_size_3d,
+                merge_factor=merge_factor,
+                merge_strategy=merge_strategy,
            )
            self.down_blocks.append(down_block)

@@ -186,8 +322,15 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
            block_out_channels[-1],
            temb_channels=blocks_time_embed_dim,
            transformer_layers_per_block=transformer_layers_per_block[-1],
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
            cross_attention_dim=cross_attention_dim[-1],
            num_attention_heads=num_attention_heads[-1],
+            resnet_groups=norm_num_groups,
+            dropout=dropout,
+            kernel_size_3d=kernel_size_3d,
+            merge_factor=merge_factor,
+            merge_strategy=merge_strategy,
        )

        # count how many layers upsample the images
@@ -198,7 +341,9 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
        reversed_num_attention_heads = list(reversed(num_attention_heads))
        reversed_layers_per_block = list(reversed(layers_per_block))
        reversed_cross_attention_dim = list(reversed(cross_attention_dim))
-        reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
+        reversed_transformer_layers_per_block = list(
+            reversed(transformer_layers_per_block)
+        )

        output_channel = reversed_block_out_channels[0]
        for i, up_block_type in enumerate(up_block_types):
@@ -206,7 +351,9 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL

            prev_output_channel = output_channel
            output_channel = reversed_block_out_channels[i]
-            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+            input_channel = reversed_block_out_channels[
+                min(i + 1, len(block_out_channels) - 1)
+            ]

            # add upsample block for all BUT final layer
            if not is_final_block:
@@ -224,24 +371,32 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
                prev_output_channel=prev_output_channel,
                temb_channels=blocks_time_embed_dim,
                add_upsample=add_upsample,
-                resnet_eps=1e-5,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
                resolution_idx=i,
+                resnet_groups=norm_num_groups,
                cross_attention_dim=reversed_cross_attention_dim[i],
                num_attention_heads=reversed_num_attention_heads[i],
-                resnet_act_fn="silu",
+                dropout=dropout,
+                kernel_size_3d=kernel_size_3d,
+                merge_factor=merge_factor,
+                merge_strategy=merge_strategy,
            )
            self.up_blocks.append(up_block)
            prev_output_channel = output_channel

        # out
-        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=32, eps=1e-5)
-        self.conv_act = nn.SiLU()
+        self.conv_norm_out = nn.GroupNorm(
+            num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps
+        )
+        self.conv_act = get_activation(act_fn)

+        conv_out_padding = (conv_out_kernel - 1) // 2
        self.conv_out = nn.Conv2d(
            block_out_channels[0],
            out_channels,
-            kernel_size=3,
-            padding=1,
+            kernel_size=conv_out_kernel,
+            padding=conv_out_padding,
        )

    @property
@@ -260,7 +415,9 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
            processors: Dict[str, AttentionProcessor],
        ):
            if hasattr(module, "get_processor"):
-                processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True)
+                processors[f"{name}.processor"] = module.get_processor(
+                    return_deprecated_lora=True
+                )

            for sub_name, child in module.named_children():
                fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
@@ -272,7 +429,11 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL

        return processors

-    def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
+    def set_attn_processor(
+        self,
+        processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]],
+        _remove_lora=False,
+    ):
        r"""
        Sets the attention processor to use to compute attention.

@@ -296,9 +457,11 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
        def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
            if hasattr(module, "set_processor"):
                if not isinstance(processor, dict):
-                    module.set_processor(processor)
+                    module.set_processor(processor, _remove_lora=_remove_lora)
                else:
-                    module.set_processor(processor.pop(f"{name}.processor"))
+                    module.set_processor(
+                        processor.pop(f"{name}.processor"), _remove_lora=_remove_lora
+                    )

            for sub_name, child in module.named_children():
                fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
@@ -310,48 +473,132 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
        """
        Disables custom attention processors and sets the default attention implementation.
        """
-        if all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()):
+        if all(
+            proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS
+            for proc in self.attn_processors.values()
+        ):
+            processor = AttnAddedKVProcessor()
+        elif all(
+            proc.__class__ in CROSS_ATTENTION_PROCESSORS
+            for proc in self.attn_processors.values()
+        ):
            processor = AttnProcessor()
        else:
            raise ValueError(
                f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}"
            )

-        self.set_attn_processor(processor)
+        self.set_attn_processor(processor, _remove_lora=True)
+
+    def set_attention_slice(self, slice_size: Union[str, int, List[int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+
+        When this option is enabled, the attention module splits the input tensor in slices to compute attention in
+        several steps. This is useful for saving some memory in exchange for a small decrease in speed.
+
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If
+                `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`. A list supplies one slice size per sliceable attention layer, in
+                the order in which the layers are visited when walking the module tree.
+
+        Raises:
+            ValueError: If a list is given whose length differs from the number of sliceable attention layers, or
+                if any slice size is larger than the `sliceable_head_dim` of its layer.
+        """
+        sliceable_head_dims = []
+
+        def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+
+            for child in module.children():
+                fn_recursive_retrieve_sliceable_dims(child)
+
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_sliceable_dims(module)
+
+        num_sliceable_layers = len(sliceable_head_dims)
+
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_sliceable_layers * [1]
+
+        # A single value is broadcast to every sliceable layer; a list is used as given.
+        if not isinstance(slice_size, list):
+            slice_size = num_sliceable_layers * [slice_size]
+
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+
+        # Lengths are equal at this point, so `zip` pairs every slice size with its layer.
+        for size, dim in zip(slice_size, sliceable_head_dims):
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(
+            module: torch.nn.Module, slice_size: List[int]
+        ):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+
+        # Work on a reversed copy so that `pop()` hands the sizes out in traversal order
+        # and a list passed in by the caller is left unmodified.
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)

    def _set_gradient_checkpointing(self, module, value=False):
        if hasattr(module, "gradient_checkpointing"):
            module.gradient_checkpointing = value

-    # Copied from diffusers.models.unet_3d_condition.UNet3DConditionModel.enable_forward_chunking
-    def enable_forward_chunking(self, chunk_size: Optional[int] = None, dim: int = 0) -> None:
+    def enable_freeu(self, s1, s2, b1, b2):
+        r"""Turns on the FreeU mechanism (https://arxiv.org/abs/2309.11497) for every up block.
+
+        The digit in each argument name is the stage block that the scaling factor is applied to.
+
+        See the [official repository](https://github.com/ChenyangSi/FreeU) for combinations of values that are known
+        to work well for different pipelines such as Stable Diffusion v1, v2, and Stable Diffusion XL.
+
+        Args:
+            s1 (`float`):
+                Stage 1 scaling factor that attenuates the contributions of the skip features, in order to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            s2 (`float`):
+                Stage 2 scaling factor that attenuates the contributions of the skip features, in order to
+                mitigate the "oversmoothing effect" in the enhanced denoising process.
+            b1 (`float`): Stage 1 scaling factor that amplifies the contributions of backbone features.
+            b2 (`float`): Stage 2 scaling factor that amplifies the contributions of backbone features.
        """
-        Sets the attention processor to use [feed forward
-        chunking](https://huggingface.co/blog/reformer#2-chunked-feed-forward-layers).
+        for up_block in self.up_blocks:
+            # The four factors are only stored as attributes on each up block here.
+            up_block.s1, up_block.s2 = s1, s2
+            up_block.b1, up_block.b2 = b1, b2

-        Parameters:
-            chunk_size (`int`, *optional*):
-                The chunk size of the feed-forward layers. If not specified, will run feed-forward layer individually
-                over each tensor of dim=`dim`.
-            dim (`int`, *optional*, defaults to `0`):
-                The dimension over which the feed-forward computation should be chunked. Choose between dim=0 (batch)
-                or dim=1 (sequence length).
-        """
-        if dim not in [0, 1]:
-            raise ValueError(f"Make sure to set `dim` to either 0 or 1, not {dim}")
-
-        # By default chunk size is 1
-        chunk_size = chunk_size or 1
-
-        def fn_recursive_feed_forward(module: torch.nn.Module, chunk_size: int, dim: int):
-            if hasattr(module, "set_chunk_feed_forward"):
-                module.set_chunk_feed_forward(chunk_size=chunk_size, dim=dim)
-
-            for child in module.children():
-                fn_recursive_feed_forward(child, chunk_size, dim)
-
-        for module in self.children():
-            fn_recursive_feed_forward(module, chunk_size, dim)
+    def disable_freeu(self):
+        """Disables the FreeU mechanism by resetting the FreeU factors stored on every up block to `None`."""
+        freeu_keys = ("s1", "s2", "b1", "b2")
+        for upsample_block in self.up_blocks:
+            for k in freeu_keys:
+                # `getattr(block, k, None) is not None` can only hold when `hasattr(block, k)` does, so the
+                # single `hasattr` check is sufficient. Blocks that never had the attribute stay untouched.
+                if hasattr(upsample_block, k):
+                    setattr(upsample_block, k, None)

    def forward(
        self,
@@ -359,28 +606,88 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
        timestep: Union[torch.Tensor, float, int],
        encoder_hidden_states: torch.Tensor,
        added_time_ids: torch.Tensor,
+        image_only_indicator: Optional[torch.Tensor] = None,
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
        return_dict: bool = True,
    ) -> Union[UNetSpatioTemporalConditionOutput, Tuple]:
        r"""
-        The [`UNetSpatioTemporalConditionModel`] forward method.
+        The [`UNet2DConditionModel`] forward method.

        Args:
            sample (`torch.FloatTensor`):
-                The noisy input tensor with the following shape `(batch, num_frames, channel, height, width)`.
+                The noisy input tensor with the following shape `(batch, channel, height, width)`.
            timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input.
            encoder_hidden_states (`torch.FloatTensor`):
-                The encoder hidden states with shape `(batch, sequence_length, cross_attention_dim)`.
-            added_time_ids: (`torch.FloatTensor`):
-                The additional time ids with shape `(batch, num_additional_ids)`. These are encoded with sinusoidal
-                embeddings and added to the time embeddings.
+            attention_mask (`torch.Tensor`, *optional*, defaults to `None`):
+                An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask
+                is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large
+                negative values to the attention scores corresponding to "discard" tokens.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            added_cond_kwargs: (`dict`):
+                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            encoder_attention_mask (`torch.Tensor`):
+                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
+                `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
+                which adds large negative values to the attention scores corresponding to "discard" tokens.
            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] instead of a plain
+                Whether or not to return a [`~models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
                tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
        Returns:
-            [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] or `tuple`:
-                If `return_dict` is True, an [`~models.unet_slatio_temporal.UNetSpatioTemporalConditionOutput`] is returned, otherwise
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+                If `return_dict` is True, an [`~models.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
                a `tuple` is returned where the first element is the sample tensor.
        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+
+        for dim in sample.shape[-2:]:
+            if dim % default_overall_up_factor != 0:
+                # Forward upsample size to force interpolation output size.
+                forward_upsample_size = True
+                break
+
+        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension
+        # expects mask of shape:
+        #   [batch, key_tokens]
+        # adds singleton query_tokens dimension:
+        #   [batch,                    1, key_tokens]
+        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
+        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
+        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
+        if attention_mask is not None:
+            # assume that mask is expressed as:
+            #   (1 = keep,      0 = discard)
+            # convert mask into a bias that can be added to attention scores:
+            #       (keep = +0,     discard = -10000.0)
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+
+        # convert encoder_attention_mask to a bias the same way we do for attention_mask
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = (
+                1 - encoder_attention_mask.to(sample.dtype)
+            ) * -10000.0
+            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
+
+        # 0. center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+
        # 1. time
        timesteps = timestep
        if not torch.is_tensor(timesteps):
@@ -421,50 +728,106 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
        # emb: [batch, channels] -> [batch * frames, channels]
        emb = emb.repeat_interleave(num_frames, dim=0)
        # encoder_hidden_states: [batch, 1, channels] -> [batch * frames, 1, channels]
-        encoder_hidden_states = encoder_hidden_states.repeat_interleave(num_frames, dim=0)
+        encoder_hidden_states = encoder_hidden_states.repeat_interleave(
+            num_frames, dim=0
+        )

        # 2. pre-process
        sample = self.conv_in(sample)

-        image_only_indicator = torch.zeros(batch_size, num_frames, dtype=sample.dtype, device=sample.device)
+        # 3. down
+        lora_scale = (
+            cross_attention_kwargs.get("scale", 1.0)
+            if cross_attention_kwargs is not None
+            else 1.0
+        )
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+
+        image_only_indicator = torch.zeros(
+            batch_size, num_frames, dtype=sample.dtype, device=sample.device
+        )

        down_block_res_samples = (sample,)
        for downsample_block in self.down_blocks:
-            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+            if (
+                hasattr(downsample_block, "has_cross_attention")
+                and downsample_block.has_cross_attention
+            ):
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                    num_video_frames=num_frames,
                    image_only_indicator=image_only_indicator,
                )
            else:
                sample, res_samples = downsample_block(
                    hidden_states=sample,
                    temb=emb,
+                    scale=lora_scale,
+                    num_video_frames=num_frames,
                    image_only_indicator=image_only_indicator,
                )

            down_block_res_samples += res_samples

        # 4. mid
-        sample = self.mid_block(
-            hidden_states=sample,
-            temb=emb,
-            encoder_hidden_states=encoder_hidden_states,
-            image_only_indicator=image_only_indicator,
-        )
+        if self.mid_block is not None:
+            if (
+                hasattr(self.mid_block, "has_cross_attention")
+                and self.mid_block.has_cross_attention
+            ):
+                sample = self.mid_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    num_video_frames=num_frames,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    encoder_attention_mask=encoder_attention_mask,
+                    image_only_indicator=image_only_indicator,
+                )
+            else:
+                sample = self.mid_block(
+                    sample,
+                    temb=emb,
+                    num_video_frames=num_frames,
+                    image_only_indicator=image_only_indicator,
+                )

        # 5. up
        for i, upsample_block in enumerate(self.up_blocks):
-            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
-            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+            is_final_block = i == len(self.up_blocks) - 1

-            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[
+                : -len(upsample_block.resnets)
+            ]
+
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+
+            if (
+                hasattr(upsample_block, "has_cross_attention")
+                and upsample_block.has_cross_attention
+            ):
                sample = upsample_block(
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
                    encoder_hidden_states=encoder_hidden_states,
+                    cross_attention_kwargs=cross_attention_kwargs,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                    encoder_attention_mask=encoder_attention_mask,
+                    num_video_frames=num_frames,
                    image_only_indicator=image_only_indicator,
                )
            else:
@@ -472,6 +835,9 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
                    hidden_states=sample,
                    temb=emb,
                    res_hidden_states_tuple=res_samples,
+                    upsample_size=upsample_size,
+                    scale=lora_scale,
+                    num_video_frames=num_frames,
                    image_only_indicator=image_only_indicator,
                )

@@ -483,6 +849,10 @@ class UNetSpatioTemporalConditionModel(ModelMixin, ConfigMixin, UNet2DConditionL
        # 7. Reshape back to original shape
        sample = sample.reshape(batch_size, num_frames, *sample.shape[1:])

+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
        if not return_dict:
            return (sample,)

@@ -127,11 +127,15 @@ class Encoder(nn.Module):
        )

        # out
-        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6)
+        self.conv_norm_out = nn.GroupNorm(
+            num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6
+        )
        self.conv_act = nn.SiLU()

        conv_out_channels = 2 * out_channels if double_z else out_channels
-        self.conv_out = nn.Conv2d(block_out_channels[-1], conv_out_channels, 3, padding=1)
+        self.conv_out = nn.Conv2d(
+            block_out_channels[-1], conv_out_channels, 3, padding=1
+        )

        self.gradient_checkpointing = False

@@ -160,9 +164,13 @@ class Encoder(nn.Module):
                )
            else:
                for down_block in self.down_blocks:
-                    sample = torch.utils.checkpoint.checkpoint(create_custom_forward(down_block), sample)
+                    sample = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(down_block), sample
+                    )
                # middle
-                sample = torch.utils.checkpoint.checkpoint(create_custom_forward(self.mid_block), sample)
+                sample = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.mid_block), sample
+                )

        else:
            # down
@@ -272,7 +280,9 @@ class Decoder(nn.Module):
        if norm_type == "spatial":
            self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels)
        else:
-            self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6
+            )
        self.conv_act = nn.SiLU()
        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)

@@ -323,7 +333,9 @@ class Decoder(nn.Module):

                # up
                for up_block in self.up_blocks:
-                    sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds)
+                    sample = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(up_block), sample, latent_embeds
+                    )
        else:
            # middle
            sample = self.mid_block(sample, latent_embeds)
@@ -363,7 +375,9 @@ class UpSample(nn.Module):
        super().__init__()
        self.in_channels = in_channels
        self.out_channels = out_channels
-        self.deconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=4, stride=2, padding=1)
+        self.deconv = nn.ConvTranspose2d(
+            in_channels, out_channels, kernel_size=4, stride=2, padding=1
+        )

    def forward(self, x: torch.FloatTensor) -> torch.FloatTensor:
        r"""The forward method of the `UpSample` class."""
@@ -407,9 +421,13 @@ class MaskConditionEncoder(nn.Module):
        for l in range(len(out_channels)):
            out_ch_ = out_channels[l]
            if l == 0 or l == 1:
-                layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=3, stride=1, padding=1))
+                layers.append(
+                    nn.Conv2d(in_ch_, out_ch_, kernel_size=3, stride=1, padding=1)
+                )
            else:
-                layers.append(nn.Conv2d(in_ch_, out_ch_, kernel_size=4, stride=2, padding=1))
+                layers.append(
+                    nn.Conv2d(in_ch_, out_ch_, kernel_size=4, stride=2, padding=1)
+                )
            in_ch_ = out_ch_

        self.layers = nn.Sequential(*layers)
@@ -524,7 +542,9 @@ class MaskConditionDecoder(nn.Module):
        if norm_type == "spatial":
            self.conv_norm_out = SpatialNorm(block_out_channels[0], temb_channels)
        else:
-            self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6)
+            self.conv_norm_out = nn.GroupNorm(
+                num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6
+            )
        self.conv_act = nn.SiLU()
        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)

@@ -574,7 +594,9 @@ class MaskConditionDecoder(nn.Module):
                for up_block in self.up_blocks:
                    if image is not None and mask is not None:
                        sample_ = im_x[str(tuple(sample.shape))]
-                        mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest")
+                        mask_ = nn.functional.interpolate(
+                            mask, size=sample.shape[-2:], mode="nearest"
+                        )
                        sample = sample * mask_ + sample_ * (1 - mask_)
                    sample = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(up_block),
@@ -604,9 +626,13 @@ class MaskConditionDecoder(nn.Module):
                for up_block in self.up_blocks:
                    if image is not None and mask is not None:
                        sample_ = im_x[str(tuple(sample.shape))]
-                        mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest")
+                        mask_ = nn.functional.interpolate(
+                            mask, size=sample.shape[-2:], mode="nearest"
+                        )
                        sample = sample * mask_ + sample_ * (1 - mask_)
-                    sample = torch.utils.checkpoint.checkpoint(create_custom_forward(up_block), sample, latent_embeds)
+                    sample = torch.utils.checkpoint.checkpoint(
+                        create_custom_forward(up_block), sample, latent_embeds
+                    )
                if image is not None and mask is not None:
                    sample = sample * mask + im_x[str(tuple(sample.shape))] * (1 - mask)
        else:
@@ -623,7 +649,9 @@ class MaskConditionDecoder(nn.Module):
            for up_block in self.up_blocks:
                if image is not None and mask is not None:
                    sample_ = im_x[str(tuple(sample.shape))]
-                    mask_ = nn.functional.interpolate(mask, size=sample.shape[-2:], mode="nearest")
+                    mask_ = nn.functional.interpolate(
+                        mask, size=sample.shape[-2:], mode="nearest"
+                    )
                    sample = sample * mask_ + sample_ * (1 - mask_)
                sample = up_block(sample, latent_embeds)
            if image is not None and mask is not None:
@@ -695,7 +723,9 @@ class VectorQuantizer(nn.Module):
        new = match.argmax(-1)
        unknown = match.sum(2) < 1
        if self.unknown_index == "random":
-            new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(device=new.device)
+            new[unknown] = torch.randint(0, self.re_embed, size=new[unknown].shape).to(
+                device=new.device
+            )
        else:
            new[unknown] = self.unknown_index
        return new.reshape(ishape)
@@ -710,13 +740,17 @@ class VectorQuantizer(nn.Module):
        back = torch.gather(used[None, :][inds.shape[0] * [0], :], 1, inds)
        return back.reshape(ishape)

-    def forward(self, z: torch.FloatTensor) -> Tuple[torch.FloatTensor, torch.FloatTensor, Tuple]:
+    def forward(
+        self, z: torch.FloatTensor
+    ) -> Tuple[torch.FloatTensor, torch.FloatTensor, Tuple]:
        # reshape z -> (batch, height, width, channel) and flatten
        z = z.permute(0, 2, 3, 1).contiguous()
        z_flattened = z.view(-1, self.vq_embed_dim)

        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
-        min_encoding_indices = torch.argmin(torch.cdist(z_flattened, self.embedding.weight), dim=1)
+        min_encoding_indices = torch.argmin(
+            torch.cdist(z_flattened, self.embedding.weight), dim=1
+        )

        z_q = self.embedding(min_encoding_indices).view(z.shape)
        perplexity = None
@@ -724,9 +758,13 @@ class VectorQuantizer(nn.Module):

        # compute loss for embedding
        if not self.legacy:
-            loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + torch.mean((z_q - z.detach()) ** 2)
+            loss = self.beta * torch.mean((z_q.detach() - z) ** 2) + torch.mean(
+                (z_q - z.detach()) ** 2
+            )
        else:
-            loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean((z_q - z.detach()) ** 2)
+            loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean(
+                (z_q - z.detach()) ** 2
+            )

        # preserve gradients
        z_q: torch.FloatTensor = z + (z_q - z).detach()
@@ -735,16 +773,22 @@ class VectorQuantizer(nn.Module):
        z_q = z_q.permute(0, 3, 1, 2).contiguous()

        if self.remap is not None:
-            min_encoding_indices = min_encoding_indices.reshape(z.shape[0], -1)  # add batch axis
+            min_encoding_indices = min_encoding_indices.reshape(
+                z.shape[0], -1
+            )  # add batch axis
            min_encoding_indices = self.remap_to_used(min_encoding_indices)
            min_encoding_indices = min_encoding_indices.reshape(-1, 1)  # flatten

        if self.sane_index_shape:
-            min_encoding_indices = min_encoding_indices.reshape(z_q.shape[0], z_q.shape[2], z_q.shape[3])
+            min_encoding_indices = min_encoding_indices.reshape(
+                z_q.shape[0], z_q.shape[2], z_q.shape[3]
+            )

        return z_q, loss, (perplexity, min_encodings, min_encoding_indices)

-    def get_codebook_entry(self, indices: torch.LongTensor, shape: Tuple[int, ...]) -> torch.FloatTensor:
+    def get_codebook_entry(
+        self, indices: torch.LongTensor, shape: Tuple[int, ...]
+    ) -> torch.FloatTensor:
        # shape specifying (batch, height, width, channel)
        if self.remap is not None:
            indices = indices.reshape(shape[0], -1)  # add batch axis
@@ -805,7 +849,9 @@ class DiagonalGaussianDistribution(object):
                    dim=[1, 2, 3],
                )

-    def nll(self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]) -> torch.Tensor:
+    def nll(
+        self, sample: torch.Tensor, dims: Tuple[int, ...] = [1, 2, 3]
+    ) -> torch.Tensor:
        if self.deterministic:
            return torch.Tensor([0.0])
        logtwopi = np.log(2.0 * np.pi)
@@ -851,7 +897,9 @@ class EncoderTiny(nn.Module):
            num_channels = block_out_channels[i]

            if i == 0:
-                layers.append(nn.Conv2d(in_channels, num_channels, kernel_size=3, padding=1))
+                layers.append(
+                    nn.Conv2d(in_channels, num_channels, kernel_size=3, padding=1)
+                )
            else:
                layers.append(
                    nn.Conv2d(
@@ -867,7 +915,9 @@ class EncoderTiny(nn.Module):
            for _ in range(num_block):
                layers.append(AutoencoderTinyBlock(num_channels, num_channels, act_fn))

-        layers.append(nn.Conv2d(block_out_channels[-1], out_channels, kernel_size=3, padding=1))
+        layers.append(
+            nn.Conv2d(block_out_channels[-1], out_channels, kernel_size=3, padding=1)
+        )

        self.layers = nn.Sequential(*layers)
        self.gradient_checkpointing = False
@@ -883,9 +933,13 @@ class EncoderTiny(nn.Module):
                return custom_forward

            if is_torch_version(">=", "1.11.0"):
-                x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False)
+                x = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.layers), x, use_reentrant=False
+                )
            else:
-                x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x)
+                x = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.layers), x
+                )

        else:
            # scale image from [-1, 1] to [0, 1] to match TAESD convention
@@ -968,9 +1022,13 @@ class DecoderTiny(nn.Module):
                return custom_forward

            if is_torch_version(">=", "1.11.0"):
-                x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x, use_reentrant=False)
+                x = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.layers), x, use_reentrant=False
+                )
            else:
-                x = torch.utils.checkpoint.checkpoint(create_custom_forward(self.layers), x)
+                x = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(self.layers), x
+                )

        else:
            x = self.layers(x)
@@ -17,12 +17,7 @@ from ..utils import (

 # These modules contain pipelines from multiple libraries/frameworks
 _dummy_objects = {}
-_import_structure = {
-    "controlnet": [],
-    "latent_diffusion": [],
-    "stable_diffusion": [],
-    "stable_diffusion_xl": [],
-}
+_import_structure = {"stable_diffusion": [], "stable_diffusion_xl": [], "latent_diffusion": [], "controlnet": []}

 try:
    if not is_torch_available():
@@ -44,11 +39,7 @@ else:
    _import_structure["dit"] = ["DiTPipeline"]
    _import_structure["latent_diffusion"].extend(["LDMSuperResolutionPipeline"])
    _import_structure["latent_diffusion_uncond"] = ["LDMPipeline"]
-    _import_structure["pipeline_utils"] = [
-        "AudioPipelineOutput",
-        "DiffusionPipeline",
-        "ImagePipelineOutput",
-    ]
+    _import_structure["pipeline_utils"] = ["AudioPipelineOutput", "DiffusionPipeline", "ImagePipelineOutput"]
    _import_structure["pndm"] = ["PNDMPipeline"]
    _import_structure["repaint"] = ["RePaintPipeline"]
    _import_structure["score_sde_ve"] = ["ScoreSdeVePipeline"]
@@ -70,10 +61,7 @@ except OptionalDependencyNotAvailable:

    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
-    _import_structure["alt_diffusion"] = [
-        "AltDiffusionImg2ImgPipeline",
-        "AltDiffusionPipeline",
-    ]
+    _import_structure["alt_diffusion"] = ["AltDiffusionImg2ImgPipeline", "AltDiffusionPipeline"]
    _import_structure["animatediff"] = ["AnimateDiffPipeline"]
    _import_structure["audioldm"] = ["AudioLDMPipeline"]
    _import_structure["audioldm2"] = [
@@ -122,10 +110,7 @@ else:
        "KandinskyV22PriorEmb2EmbPipeline",
        "KandinskyV22PriorPipeline",
    ]
-    _import_structure["kandinsky3"] = [
-        "Kandinsky3Img2ImgPipeline",
-        "Kandinsky3Pipeline",
-    ]
+    _import_structure["kandinsky3"] = ["Kandinsky3Img2ImgPipeline", "Kandinsky3Pipeline"]
    _import_structure["latent_consistency_models"] = [
        "LatentConsistencyModelImg2ImgPipeline",
        "LatentConsistencyModelPipeline",
@@ -160,12 +145,12 @@ else:
            "StableDiffusionPix2PixZeroPipeline",
            "StableDiffusionSAGPipeline",
            "StableDiffusionUpscalePipeline",
+            "StableDiffusionVideoPipeline",
            "StableUnCLIPImg2ImgPipeline",
            "StableUnCLIPPipeline",
        ]
    )
    _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"]
-    _import_structure["stable_video_diffusion"] = ["StableVideoDiffusionPipeline"]
    _import_structure["stable_diffusion_xl"].extend(
        [
            "StableDiffusionXLImg2ImgPipeline",
@@ -174,14 +159,10 @@ else:
            "StableDiffusionXLPipeline",
        ]
    )
-    _import_structure["t2i_adapter"] = [
-        "StableDiffusionAdapterPipeline",
-        "StableDiffusionXLAdapterPipeline",
-    ]
+    _import_structure["t2i_adapter"] = ["StableDiffusionAdapterPipeline", "StableDiffusionXLAdapterPipeline"]
    _import_structure["text_to_video_synthesis"] = [
        "TextToVideoSDPipeline",
        "TextToVideoZeroPipeline",
-        "TextToVideoZeroSDXLPipeline",
        "VideoToVideoSDPipeline",
    ]
    _import_structure["unclip"] = ["UnCLIPImageVariationPipeline", "UnCLIPPipeline"]
@@ -235,9 +216,7 @@ try:
    if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()):
        raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
-    from ..utils import (
-        dummy_torch_and_transformers_and_k_diffusion_objects,
-    )
+    from ..utils import dummy_torch_and_transformers_and_k_diffusion_objects  # noqa F403

    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects))
 else:
@@ -280,10 +259,7 @@ except OptionalDependencyNotAvailable:

    _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects))
 else:
-    _import_structure["spectrogram_diffusion"] = [
-        "MidiProcessor",
-        "SpectrogramDiffusionPipeline",
-    ]
+    _import_structure["spectrogram_diffusion"] = ["MidiProcessor", "SpectrogramDiffusionPipeline"]

 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
@@ -293,11 +269,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from ..utils.dummy_pt_objects import *  # noqa F403

    else:
-        from .auto_pipeline import (
-            AutoPipelineForImage2Image,
-            AutoPipelineForInpainting,
-            AutoPipelineForText2Image,
-        )
+        from .auto_pipeline import AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image
        from .consistency_models import ConsistencyModelPipeline
        from .dance_diffusion import DanceDiffusionPipeline
        from .ddim import DDIMPipeline
@@ -305,11 +277,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .dit import DiTPipeline
        from .latent_diffusion import LDMSuperResolutionPipeline
        from .latent_diffusion_uncond import LDMPipeline
-        from .pipeline_utils import (
-            AudioPipelineOutput,
-            DiffusionPipeline,
-            ImagePipelineOutput,
-        )
+        from .pipeline_utils import AudioPipelineOutput, DiffusionPipeline, ImagePipelineOutput
        from .pndm import PNDMPipeline
        from .repaint import RePaintPipeline
        from .score_sde_ve import ScoreSdeVePipeline
@@ -332,11 +300,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline
        from .animatediff import AnimateDiffPipeline
        from .audioldm import AudioLDMPipeline
-        from .audioldm2 import (
-            AudioLDM2Pipeline,
-            AudioLDM2ProjectionModel,
-            AudioLDM2UNet2DConditionModel,
-        )
+        from .audioldm2 import AudioLDM2Pipeline, AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel
        from .blip_diffusion import BlipDiffusionPipeline
        from .controlnet import (
            BlipDiffusionControlNetPipeline,
@@ -380,10 +344,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            Kandinsky3Img2ImgPipeline,
            Kandinsky3Pipeline,
        )
-        from .latent_consistency_models import (
-            LatentConsistencyModelImg2ImgPipeline,
-            LatentConsistencyModelPipeline,
-        )
+        from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
        from .latent_diffusion import LDMTextToImagePipeline
        from .musicldm import MusicLDMPipeline
        from .paint_by_example import PaintByExamplePipeline
@@ -412,6 +373,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            StableDiffusionPix2PixZeroPipeline,
            StableDiffusionSAGPipeline,
            StableDiffusionUpscalePipeline,
+            StableDiffusionVideoPipeline,
            StableUnCLIPImg2ImgPipeline,
            StableUnCLIPPipeline,
        )
@@ -422,15 +384,10 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            StableDiffusionXLInstructPix2PixPipeline,
            StableDiffusionXLPipeline,
        )
-        from .stable_video_diffusion import StableVideoDiffusionPipeline
-        from .t2i_adapter import (
-            StableDiffusionAdapterPipeline,
-            StableDiffusionXLAdapterPipeline,
-        )
+        from .t2i_adapter import StableDiffusionAdapterPipeline, StableDiffusionXLAdapterPipeline
        from .text_to_video_synthesis import (
            TextToVideoSDPipeline,
            TextToVideoZeroPipeline,
-            TextToVideoZeroSDXLPipeline,
            VideoToVideoSDPipeline,
        )
        from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline
@@ -516,10 +473,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            from ..utils.dummy_transformers_and_torch_and_note_seq_objects import *  # noqa F403

        else:
-            from .spectrogram_diffusion import (
-                MidiProcessor,
-                SpectrogramDiffusionPipeline,
-            )
+            from .spectrogram_diffusion import MidiProcessor, SpectrogramDiffusionPipeline

 else:
    import sys
@@ -22,7 +22,7 @@ from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection, XLMR
 from ...configuration_utils import FrozenDict
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
-from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
+from ...models import AutoencoderKL, UNet2DConditionModel
 from ...models.lora import adjust_lora_scale_text_encoder
 from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
@@ -494,29 +494,18 @@ class AltDiffusionPipeline(

        return prompt_embeds, negative_prompt_embeds

-    def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None):
+    def encode_image(self, image, device, num_images_per_prompt):
        dtype = next(self.image_encoder.parameters()).dtype

        if not isinstance(image, torch.Tensor):
            image = self.feature_extractor(image, return_tensors="pt").pixel_values

        image = image.to(device=device, dtype=dtype)
-        if output_hidden_states:
-            image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2]
-            image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_enc_hidden_states = self.image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-            uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(
-                num_images_per_prompt, dim=0
-            )
-            return image_enc_hidden_states, uncond_image_enc_hidden_states
-        else:
-            image_embeds = self.image_encoder(image).image_embeds
-            image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-            uncond_image_embeds = torch.zeros_like(image_embeds)
+        image_embeds = self.image_encoder(image).image_embeds
+        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)

-            return image_embeds, uncond_image_embeds
+        uncond_image_embeds = torch.zeros_like(image_embeds)
+        return image_embeds, uncond_image_embeds

    def run_safety_checker(self, image, device, dtype):
        if self.safety_checker is None:
@@ -886,10 +875,7 @@ class AltDiffusionPipeline(
            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

        if ip_adapter_image is not None:
-            output_hidden_state = False if isinstance(self.unet.encoder_hid_proj, ImageProjection) else True
-            image_embeds, negative_image_embeds = self.encode_image(
-                ip_adapter_image, device, num_images_per_prompt, output_hidden_state
-            )
+            image_embeds, negative_image_embeds = self.encode_image(ip_adapter_image, device, num_images_per_prompt)
            if self.do_classifier_free_guidance:
                image_embeds = torch.cat([negative_image_embeds, image_embeds])

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Dhruv Nair	56e8fca572	Merge branch 'main' into test-v	2023-11-27 13:36:38 +00:00
Dhruv Nair	c5941a26a4	Merge branch 'test-v' of https://github.com/huggingface/diffusers into test-v	2023-11-27 13:35:36 +00:00
Dhruv Nair	8bc42512fe	remove post quant conv	2023-11-27 13:27:46 +00:00
patil-suraj	55b4d09080	fix upcasting	2023-11-27 14:11:26 +01:00
patil-suraj	c452d9c042	up	2023-11-27 13:59:30 +01:00
patil-suraj	ee9f7d2493	make added_time_ids is tensor	2023-11-27 13:55:02 +01:00
Dhruv Nair	8620851aa0	update forward pass for gradient checkpointing	2023-11-27 12:50:58 +00:00
patil-suraj	90d8e832f8	upcast vae	2023-11-27 13:50:10 +01:00
patil-suraj	18930e0b85	doc	2023-11-27 13:40:30 +01:00
patil-suraj	847bd0a479	fix copies	2023-11-27 13:23:31 +01:00
Dhruv Nair	3178b16b17	update	2023-11-27 11:37:52 +00:00
patil-suraj	a08ef009d1	use math for log	2023-11-27 12:16:02 +01:00
patil-suraj	804bdebe51	Merge branch 'test-v' of https://github.com/huggingface/diffusers into test-v	2023-11-27 12:01:11 +01:00
patil-suraj	a193e49dff	use c_noise values for timesteps	2023-11-27 12:01:08 +01:00
Dhruv Nair	c9d1727613	clean up	2023-11-27 11:00:02 +00:00
Dhruv Nair	82cf60828f	Merge branch 'test-v' of https://github.com/huggingface/diffusers into test-v	2023-11-27 10:50:12 +00:00
Dhruv Nair	26ed460265	clean up	2023-11-27 10:49:58 +00:00
Dhruv Nair	403a81c30d	clean up temp decoder	2023-11-27 10:21:22 +00:00
patil-suraj	1b3cf2db5e	Merge branch 'test-v' of https://github.com/huggingface/diffusers into test-v	2023-11-27 11:13:20 +01:00
patil-suraj	b8d84c4320	fix norm eps in TransformerSpatioTemporalModel	2023-11-27 11:13:18 +01:00
Dhruv Nair	3fbe123d84	make temb optional in Decoder mid block	2023-11-27 10:09:41 +00:00
Dhruv Nair	f7cf8c338c	clean up	2023-11-27 09:53:56 +00:00
Dhruv Nair	ab8076f234	Merge branch 'test-v' of https://github.com/huggingface/diffusers into test-v	2023-11-27 09:50:00 +00:00
Dhruv Nair	7b6a0d48c6	add slow svd test	2023-11-27 09:45:00 +00:00
patil-suraj	6adae54046	clean TransformerSpatioTemporalModel	2023-11-27 10:34:44 +01:00
patil-suraj	af85fb1bc1	clean up unet	2023-11-27 10:03:40 +01:00
Dhruv Nair	760333d524	add unet tests	2023-11-27 08:12:02 +00:00
patil-suraj	f651c12ef8	don't scale image latents	2023-11-26 17:13:04 +01:00
patil-suraj	d614a33a09	use AutoencoderKLTemporalDecoder	2023-11-26 17:00:22 +01:00
patil-suraj	13b646edd3	remove hack	2023-11-26 16:59:21 +01:00
patil-suraj	cb49cbdd29	add pipeline and vae in init	2023-11-26 16:58:59 +01:00
patil-suraj	1ce8ff51e6	accept fps as arg	2023-11-26 16:20:22 +01:00
patil-suraj	fdd182f335	allow passing PIL to export_video	2023-11-26 16:19:25 +01:00
patil-suraj	2a46326c25	up	2023-11-26 16:07:24 +01:00
patil-suraj	e34e9d9a33	take guidance scale as input	2023-11-26 16:06:44 +01:00
patil-suraj	96af28f92b	style	2023-11-26 16:01:32 +01:00
patil-suraj	6827a1dc6a	add vae conversion	2023-11-26 15:42:27 +01:00
patil-suraj	c3bdeb8a4c	skip_post_quant_conv	2023-11-26 13:07:50 +01:00
patil-suraj	cf70b9a0b4	fix missing activation in TemporalDecoder	2023-11-26 13:06:44 +01:00
patil-suraj	712b9950c5	fix guidance_scales dtype	2023-11-26 12:47:51 +01:00
patil-suraj	21148de853	fix typo	2023-11-26 12:45:01 +01:00
patil-suraj	d930977656	fix attention in MidBlockTemporalDecoder	2023-11-26 12:01:14 +01:00
patil-suraj	268ffea0e7	cast alpha to sample dtype	2023-11-26 11:15:28 +01:00
patil-suraj	8bcf43d52a	fix num frames during split decoding	2023-11-26 11:10:42 +01:00
patil-suraj	b071aaa719	switch spatial to temporal for mixing in VAE	2023-11-26 10:51:53 +01:00
patil-suraj	5316fb5107	pass num frames in decode	2023-11-25 19:15:19 +01:00
patil-suraj	9af07d1d5c	fix default values in vae	2023-11-25 19:09:47 +01:00
patil-suraj	d0017d9b70	allow using differnt eps in temporal block for video decoder	2023-11-25 19:02:57 +01:00
patil-suraj	0cf6c6b291	type image_latents same as image_embeddings	2023-11-25 16:20:01 +01:00
patil-suraj	df986274d6	fix dtype in TransformerSpatioTemporalModel	2023-11-25 16:17:45 +01:00
patil-suraj	7ddd14bd94	vae encode/decode in fp32	2023-11-25 16:16:01 +01:00
patil-suraj	4346ddd402	fix decode_latents	2023-11-25 14:33:25 +01:00
patil-suraj	9da55b381c	pass decoding_t to decode_latents	2023-11-25 14:30:27 +01:00
patil-suraj	4d4469ee87	decode n frames at a time	2023-11-25 14:30:09 +01:00
patil-suraj	f9954a0e7b	decode in float32	2023-11-25 14:02:23 +01:00
patil-suraj	e7798333c4	fix frame decodig	2023-11-25 14:01:01 +01:00
patil-suraj	efb1e5e1d8	make pipeline run	2023-11-24 21:30:31 +01:00
Dhruv Nair	beaaf18b2c	Merge branch 'test-v' of https://github.com/huggingface/diffusers into test-v	2023-11-24 16:36:06 +00:00
Dhruv Nair	132fe97bf4	add temporal autoencoder	2023-11-24 16:35:41 +00:00
patil-suraj	2f35e8c94c	fix norm eps in temporal transformers	2023-11-24 15:40:41 +01:00
patil-suraj	b336529573	add guidance scalings	2023-11-24 14:16:50 +01:00
patil-suraj	3e47d3c8ed	adapt scheduler	2023-11-24 14:06:07 +01:00
patil-suraj	122a6bd390	begin pipeline	2023-11-24 13:36:57 +01:00
Dhruv Nair	37c428a79c	Merge branch 'test-v' of https://github.com/huggingface/diffusers into test-v	2023-11-24 12:24:57 +00:00
Dhruv Nair	eefed8ab6b	update up/mid blocks for decoder	2023-11-24 12:23:14 +00:00
Dhruv Nair	05eaec2d39	Merge branch 'test-v-old' into test-v	2023-11-24 12:19:29 +00:00
Dhruv Nair	e68424378f	update vae	2023-11-24 12:19:11 +00:00
patil-suraj	24b5c4360c	check for None	2023-11-24 11:53:50 +01:00
patil-suraj	0c4192b537	up	2023-11-24 11:51:40 +01:00
patil-suraj	dff26ce8af	up	2023-11-24 11:50:02 +01:00
patil-suraj	9f22651c1f	remove more unsed args	2023-11-24 11:48:58 +01:00
patil-suraj	d8c9e67aac	remove unused arg	2023-11-24 11:38:34 +01:00
patil-suraj	6c28367b1a	remove unused arg	2023-11-24 11:36:01 +01:00
patil-suraj	f9def2aeed	add in init	2023-11-24 11:31:30 +01:00
patil-suraj	576fa1c7dc	remove UNetMidBlockSpatioTemporal	2023-11-24 11:30:35 +01:00
patil-suraj	f1457b7e1d	update conversion script	2023-11-24 11:24:42 +01:00
patil-suraj	1f34311eec	rename model	2023-11-24 11:24:34 +01:00
patil-suraj	f976f5a31e	Merge branch 'test-v' of https://github.com/huggingface/diffusers into test-v	2023-11-24 11:17:55 +01:00
patil-suraj	8e1851a16a	Merge branch 'test-v' of https://github.com/huggingface/diffusers into test-v	2023-11-24 11:17:51 +01:00
patil-suraj	6c69c7a0d2	add blocks	2023-11-24 11:11:15 +01:00
Dhruv Nair	6481e9495f	make temb optional	2023-11-24 10:10:09 +00:00
Dhruv Nair	8c3fd58c85	Merge branch 'test-v' of https://github.com/huggingface/diffusers into test-v	2023-11-24 09:51:43 +00:00
Dhruv Nair	9117547ee0	clean up	2023-11-24 09:51:29 +00:00
patil-suraj	af1e86af8d	fix time_context dim	2023-11-24 10:47:44 +01:00
patil-suraj	29551f8e30	fix TransformerSpatioTemporalModel	2023-11-24 10:19:44 +01:00
patil-suraj	661033171b	use TransformerSpatioTemporalModel	2023-11-24 10:16:22 +01:00
patil-suraj	20efe541c5	fix TemporalBasicTransformerBlock	2023-11-24 10:11:40 +01:00
patil-suraj	5a523e21c6	reuse TemporalBasicTransformerBlock	2023-11-24 10:04:22 +01:00
patil-suraj	b0fc4fd4cb	fix SpatioTemporalResBlock	2023-11-24 10:01:09 +01:00
patil-suraj	678d19fa18	fix temb shape	2023-11-24 09:41:15 +01:00
patil-suraj	c8ec445964	style	2023-11-24 09:34:53 +01:00
patil-suraj	ffd9e26a65	use new blocks	2023-11-24 09:26:42 +01:00
patil-suraj	6f87490408	fix shapes in Alphablender and add time activation in res blcok	2023-11-24 08:57:28 +01:00
Dhruv Nair	9c9d46763b	update	2023-11-24 07:12:50 +00:00
Dhruv Nair	47684dab43	update	2023-11-24 04:14:58 +00:00
Dhruv Nair	5218f46173	fix blocks	2023-11-23 14:32:18 +00:00
Dhruv Nair	8ee280773f	add vae blocks	2023-11-23 14:28:07 +00:00
Dhruv Nair	85846f7450	add spatio temporal transformers	2023-11-23 13:02:34 +00:00
patil-suraj	28dee6e735	fix temb shape in TemporalResnetBlock	2023-11-23 13:52:48 +01:00
patil-suraj	165ed7c5d5	return sample in original shape	2023-11-23 13:52:40 +01:00
patil-suraj	d4cdfa33f5	make forward work	2023-11-23 13:35:52 +01:00
Dhruv Nair	1bd09b1489	Merge branch 'test-v' of https://github.com/huggingface/diffusers into test-v	2023-11-23 10:54:08 +00:00
Dhruv Nair	edf7121ec7	add new resnet blocks	2023-11-23 10:53:25 +00:00
patil-suraj	7b64d3a17b	up	2023-11-23 10:48:59 +01:00
patil-suraj	c93606c93c	fix model	2023-11-23 10:47:57 +01:00
patil-suraj	5df09ef355	add conversion script	2023-11-22 19:15:18 +01:00
patil-suraj	ac9473153c	fix add_embedding	2023-11-22 19:04:10 +01:00
patil-suraj	ee9d7b8ecd	fix time_pos_embed	2023-11-22 18:59:44 +01:00
patil-suraj	669824e5bb	fix temporal res block	2023-11-22 17:44:56 +01:00
patil-suraj	45c9b56bf7	use TimestepEmbedding	2023-11-22 15:56:09 +01:00
patil-suraj	cad51d45d1	addition_time_embed_dim	2023-11-22 14:26:43 +01:00
patil-suraj	7de5d7c6fd	add_embedding	2023-11-22 14:06:50 +01:00
patil-suraj	58883ee085	finish blocks	2023-11-22 13:42:10 +01:00
patil-suraj	2f5648177e	begin model	2023-11-21 16:39:15 +01:00