up

2024-03-18 11:47:47 +01:00 · 2024-03-18 11:34:17 +01:00
158 changed files with 328 additions and 3281 deletions
@@ -105,4 +105,4 @@ jobs:
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v \
          --make-reports=tests_${{ matrix.config.report }} \
-          tests/lora/
+          tests/lora/test_lora_layers_peft.py
@@ -21,7 +21,10 @@ env:
 jobs:
  setup_torch_cuda_pipeline_matrix:
    name: Setup Torch Pipelines CUDA Slow Tests Matrix
-    runs-on: ubuntu-latest
+    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    container:
+      image: diffusers/diffusers-pytorch-cpu # this is a CPU image, but we need it to fetch the matrix
+      options: --shm-size "16gb" --ipc host
    outputs:
      pipeline_test_matrix: ${{ steps.fetch_pipeline_matrix.outputs.pipeline_test_matrix }}
    steps:
@@ -29,20 +32,24 @@ jobs:
        uses: actions/checkout@v3
        with:
          fetch-depth: 2
-      - name: Set up Python
-        uses: actions/setup-python@v4
-        with:
-          python-version: "3.8"
      - name: Install dependencies
        run: |
-          pip install -e .
-          pip install huggingface_hub
+          apt-get update && apt-get install libsndfile1-dev libgl1 -y
+          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+          python -m uv pip install -e [quality,test]
+          python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
+
+      - name: Environment
+        run: |
+          python utils/print_env.py
+
      - name: Fetch Pipeline Matrix
        id: fetch_pipeline_matrix
        run: |
          matrix=$(python utils/fetch_torch_cuda_pipeline_test_matrix.py)
          echo $matrix
          echo "pipeline_test_matrix=$matrix" >> $GITHUB_OUTPUT
+
      - name: Pipeline Tests Artifacts
        if: ${{ always() }}
        uses: actions/upload-artifact@v2
@@ -19,16 +19,6 @@ authors:
    family-names: Rasul
  - given-names: Mishig
    family-names: Davaadorj
-  - given-names: Dhruv
-    family-names: Nair
-  - given-names: Sayak
-    family-names: Paul
-  - given-names: Steven
-    family-names: Liu
-  - given-names: William
-    family-names: Berman
-  - given-names: Yiyi
-    family-names: Xu
  - given-names: Thomas
    family-names: Wolf
 repository-code: 'https://github.com/huggingface/diffusers'
@@ -238,7 +238,7 @@ We also want to thank @heejkoo for the very helpful overview of papers, code and

 ```bibtex
@misc{von-platen-etal-2022-diffusers,
-  author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Dhruv Nair and Sayak Paul and William Berman and Yiyi Xu and Steven Liu and Thomas Wolf},
+  author = {Patrick von Platen and Suraj Patil and Anton Lozhkov and Pedro Cuenca and Nathan Lambert and Kashif Rasul and Mishig Davaadorj and Thomas Wolf},
  title = {Diffusers: State-of-the-art diffusion models},
  year = {2022},
  publisher = {GitHub},
@@ -408,29 +408,6 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)

 </Tip>

-<table>
-    <tr>
-      <th align=center>Without FreeInit enabled</th>
-      <th align=center>With FreeInit enabled</th>
-    </tr>
-    <tr>
-        <td align=center>
-          panda playing a guitar
-          <br />
-          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-no-freeinit.gif"
-              alt="panda playing a guitar"
-              style="width: 300px;" />
-        </td>
-        <td align=center>
-          panda playing a guitar
-          <br/>
-          <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/animatediff-freeinit.gif"
-              alt="panda playing a guitar"
-              style="width: 300px;" />
-        </td>
-    </tr>
-</table>
-
 ## Using AnimateLCM

 [AnimateLCM](https://animatelcm.github.io/) is a motion module checkpoint and an [LCM LoRA](https://huggingface.co/docs/diffusers/using-diffusers/inference_with_lcm_lora) that have been created using a consistency learning strategy that decouples the distillation of the image generation priors and the motion generation priors.
@@ -45,7 +45,7 @@ Make sure to include the token `toy_face` in the prompt and then you can perform
 ```python
 prompt = "toy_face of a hacker with a hoodie"

-lora_scale = 0.9
+lora_scale= 0.9
 image = pipe(
    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
 ).images[0]
@@ -114,7 +114,7 @@ To return to only using one adapter, use the [`~diffusers.loaders.UNet2DConditio
 pipe.set_adapters("toy")

 prompt = "toy_face of a hacker with a hoodie"
-lora_scale = 0.9
+lora_scale= 0.9
 image = pipe(
    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
 ).images[0]
@@ -127,12 +127,11 @@ Or to disable all adapters entirely, use the [`~diffusers.loaders.UNet2DConditio
 pipe.disable_lora()

 prompt = "toy_face of a hacker with a hoodie"
+lora_scale= 0.9
 image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
 image
 ```

-![no-lora](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_20_1.png)
-
 ## Manage active adapters

 You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.LoraLoaderMixin.get_active_adapters`] method to check the list of active adapters:
@@ -239,7 +239,5 @@ pipeline.to("cuda")
 prompt = "柴犬、カラフルアート"

 image = pipeline(prompt=prompt).images[0]
-```

-> [!TIP]
-> When using `trust_remote_code=True`, it is also strongly encouraged to pass a commit hash as a `revision` to make sure the author of the models did not update the code with some malicious new lines (unless you fully trust the authors of the models).
+```
@@ -60,23 +60,6 @@ repo_id = "runwayml/stable-diffusion-v1-5"
 pipe = StableDiffusionImg2ImgPipeline.from_pretrained(repo_id)
 ```

-You can use the Space below to gauge the memory requirements of a pipeline you want to load beforehand without downloading the pipeline checkpoints:
-
-<div class="block dark:hidden">
-	<iframe 
-        src="https://diffusers-compute-pipeline-size.hf.space?__theme=light"
-        width="850"
-        height="1600"
-    ></iframe>
-</div>
-<div class="hidden dark:block">
-    <iframe 
-        src="https://diffusers-compute-pipeline-size.hf.space?__theme=dark"
-        width="850"
-        height="1600"
-    ></iframe>
-</div>
-
 ### Local pipeline

 To load a diffusion pipeline locally, use [`git-lfs`](https://git-lfs.github.com/) to manually download the checkpoint (in this case, [`runwayml/stable-diffusion-v1-5`](https://huggingface.co/runwayml/stable-diffusion-v1-5)) to your local disk. This creates a local folder, `./stable-diffusion-v1-5`, on your disk:
@@ -21,7 +21,7 @@ This guide will show you how to use SVD to generate short videos from images.
 Before you begin, make sure you have the following libraries installed:

 ```py
-!pip install -q -U diffusers transformers accelerate
+!pip install -q -U diffusers transformers accelerate 
 ```

 There are two variants of this model, [SVD](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid) and [SVD-XT](https://huggingface.co/stabilityai/stable-video-diffusion-img2vid-xt). The SVD checkpoint is trained to generate 14 frames and the SVD-XT checkpoint is further finetuned to generate 25 frames.
@@ -86,7 +86,7 @@ Video generation is very memory intensive because you're essentially generating
 + frames = pipe(image, decode_chunk_size=2, generator=generator, num_frames=25).frames[0]
 ```

-Using all these tricks together should lower the memory requirement to less than 8GB VRAM.
+Using all these tricks together should lower the memory requirement to less than 8GB VRAM.

 ## Micro-conditioning

@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

 # 메모리와 속도

-메모리 또는 속도에 대해 🤗 Diffusers *추론*을 최적화하기 위한 몇 가지 기술과 아이디어를 제시합니다.
+메모리 또는 속도에 대해 🤗 Diffusers *추론*을 최적화하기 위한 몇 가지 기술과 아이디어를 제시합니다. 
 일반적으로, memory-efficient attention을 위해 [xFormers](https://github.com/facebookresearch/xformers) 사용을 추천하기 때문에, 추천하는 [설치 방법](xformers)을 보고 설치해 보세요.

 다음 설정이 성능과 메모리에 미치는 영향에 대해 설명합니다.
@@ -27,7 +27,7 @@ specific language governing permissions and limitations under the License.
 | memory-efficient attention | 2.63s  | x3.61   |

 <em>
-   NVIDIA TITAN RTX에서 50 DDIM 스텝의 "a photo of an astronaut riding a horse on mars" 프롬프트로 512x512 크기의 단일 이미지를 생성하였습니다.
+   NVIDIA TITAN RTX에서 50 DDIM 스텝의 "a photo of an astronaut riding a horse on mars" 프롬프트로 512x512 크기의 단일 이미지를 생성하였습니다. 
 </em>

 ## cuDNN auto-tuner 활성화하기
@@ -44,11 +44,11 @@ torch.backends.cudnn.benchmark = True

 ### fp32 대신 tf32 사용하기  (Ampere 및 이후 CUDA 장치들에서)

-Ampere 및 이후 CUDA 장치에서 행렬곱 및 컨볼루션은 TensorFloat32(TF32) 모드를 사용하여 더 빠르지만 약간 덜 정확할 수 있습니다.
-기본적으로 PyTorch는 컨볼루션에 대해 TF32 모드를 활성화하지만 행렬 곱셈은 활성화하지 않습니다.
-네트워크에 완전한 float32 정밀도가 필요한 경우가 아니면 행렬 곱셈에 대해서도 이 설정을 활성화하는 것이 좋습니다.
-이는 일반적으로 무시할 수 있는 수치의 정확도 손실이 있지만, 계산 속도를 크게 높일 수 있습니다.
-그것에 대해 [여기](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32)서 더 읽을 수 있습니다.
+Ampere 및 이후 CUDA 장치에서 행렬곱 및 컨볼루션은 TensorFloat32(TF32) 모드를 사용하여 더 빠르지만 약간 덜 정확할 수 있습니다. 
+기본적으로 PyTorch는 컨볼루션에 대해 TF32 모드를 활성화하지만 행렬 곱셈은 활성화하지 않습니다. 
+네트워크에 완전한 float32 정밀도가 필요한 경우가 아니면 행렬 곱셈에 대해서도 이 설정을 활성화하는 것이 좋습니다. 
+이는 일반적으로 무시할 수 있는 수치의 정확도 손실이 있지만, 계산 속도를 크게 높일 수 있습니다. 
+그것에 대해 [여기](https://huggingface.co/docs/transformers/v4.18.0/en/performance#tf32)서 더 읽을 수 있습니다. 
 추론하기 전에 다음을 추가하기만 하면 됩니다:

 ```python
@@ -59,13 +59,13 @@ torch.backends.cuda.matmul.allow_tf32 = True

 ## 반정밀도 가중치

-더 많은 GPU 메모리를 절약하고 더 빠른 속도를 얻기 위해 모델 가중치를 반정밀도(half precision)로 직접 불러오고 실행할 수 있습니다.
+더 많은 GPU 메모리를 절약하고 더 빠른 속도를 얻기 위해 모델 가중치를 반정밀도(half precision)로 직접 불러오고 실행할 수 있습니다. 
 여기에는 `fp16`이라는 브랜치에 저장된 float16 버전의 가중치를 불러오고, 그 때 `float16` 유형을 사용하도록 PyTorch에 지시하는 작업이 포함됩니다.

 ```Python
 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-
+    
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -75,7 +75,7 @@ image = pipe(prompt).images[0]
 ```

 <Tip warning={true}>
-  어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다.
+  어떤 파이프라인에서도 [`torch.autocast`](https://pytorch.org/docs/stable/amp.html#torch.autocast) 를 사용하는 것은 검은색 이미지를 생성할 수 있고, 순수한 float16 정밀도를 사용하는 것보다 항상 느리기 때문에 사용하지 않는 것이 좋습니다. 
 </Tip>

 ## 추가 메모리 절약을 위한 슬라이스 어텐션
@@ -95,7 +95,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-
+    
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -122,7 +122,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-
+    
    torch_dtype=torch.float16,
 )
 pipe = pipe.to("cuda")
@@ -148,7 +148,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-
+    
    torch_dtype=torch.float16,
 )

@@ -165,7 +165,7 @@ image = pipe(prompt).images[0]
 또 다른 최적화 방법인 <a href="#model_offloading">모델 오프로딩</a>을 사용하는 것을 고려하십시오. 이는 훨씬 빠르지만 메모리 절약이 크지는 않습니다.
 </Tip>

-또한 attention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다.
+또한 attention slicing과 연결해서 최소 메모리(< 2GB)로도 동작할 수 있습니다. 


 ```Python
@@ -174,7 +174,7 @@ from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",
-
+    
    torch_dtype=torch.float16,
 )

@@ -204,7 +204,7 @@ import torch
 from diffusers import StableDiffusionPipeline

 pipe = StableDiffusionPipeline.from_pretrained(
-    "runwayml/stable-diffusion-v1-5",
+    "runwayml/stable-diffusion-v1-5",  
    torch_dtype=torch.float16,
 )

@@ -355,7 +355,7 @@ unet_traced = torch.jit.load("unet_traced.pt")
 class TracedUNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
-        self.in_channels = pipe.unet.config.in_channels
+        self.in_channels = pipe.unet.in_channels
        self.device = pipe.unet.device

    def forward(self, latent_model_input, t, encoder_hidden_states):
@@ -387,7 +387,7 @@ with torch.inference_mode():
 | A100-SXM4-40GB    	| 18.6it/s            	| 29.it/s                        	|
 | A100-SXM-80GB    	| 18.7it/s            	| 29.5it/s                        	|

-이를 활용하려면 다음을 만족해야 합니다:
+이를 활용하려면 다음을 만족해야 합니다: 
 - PyTorch > 1.12
 - Cuda 사용 가능
 - [xformers 라이브러리를 설치함](xformers)
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.

 [[open-in-colab]]

-🧨 Diffusers는 사용자 친화적이며 유연한 도구 상자로, 사용사례에 맞게 diffusion 시스템을 구축 할 수 있도록 설계되었습니다. 이 도구 상자의 핵심은 모델과 스케줄러입니다. [`DiffusionPipeline`]은 편의를 위해 이러한 구성 요소를 번들로 제공하지만, 파이프라인을 분리하고 모델과 스케줄러를 개별적으로 사용해 새로운 diffusion 시스템을 만들 수도 있습니다.
+🧨 Diffusers는 사용자 친화적이며 유연한 도구 상자로, 사용사례에 맞게 diffusion 시스템을 구축 할 수 있도록 설계되었습니다. 이 도구 상자의 핵심은 모델과 스케줄러입니다. [`DiffusionPipeline`]은 편의를 위해 이러한 구성 요소를 번들로 제공하지만, 파이프라인을 분리하고 모델과 스케줄러를 개별적으로 사용해 새로운 diffusion 시스템을 만들 수도 있습니다. 

 이 튜토리얼에서는 기본 파이프라인부터 시작해 Stable Diffusion 파이프라인까지 진행하며 모델과 스케줄러를 사용해 추론을 위한 diffusion 시스템을 조립하는 방법을 배웁니다.

@@ -36,7 +36,7 @@ specific language governing permissions and limitations under the License.

 정말 쉽습니다. 그런데 파이프라인은 어떻게 이렇게 할 수 있었을까요? 파이프라인을 세분화하여 내부에서 어떤 일이 일어나고 있는지 살펴보겠습니다.

-위 예시에서 파이프라인에는 [`UNet2DModel`] 모델과 [`DDPMScheduler`]가 포함되어 있습니다. 파이프라인은 원하는 출력 크기의 랜덤 노이즈를 받아 모델을 여러번 통과시켜 이미지의 노이즈를 제거합니다. 각 timestep에서 모델은 *noise residual*을 예측하고 스케줄러는 이를 사용하여 노이즈가 적은 이미지를 예측합니다. 파이프라인은 지정된 추론 스텝수에 도달할 때까지 이 과정을 반복합니다.
+위 예시에서 파이프라인에는 [`UNet2DModel`] 모델과 [`DDPMScheduler`]가 포함되어 있습니다. 파이프라인은 원하는 출력 크기의 랜덤 노이즈를 받아 모델을 여러번 통과시켜 이미지의 노이즈를 제거합니다. 각 timestep에서 모델은 *noise residual*을 예측하고 스케줄러는 이를 사용하여 노이즈가 적은 이미지를 예측합니다. 파이프라인은 지정된 추론 스텝수에 도달할 때까지 이 과정을 반복합니다. 

 모델과 스케줄러를 별도로 사용하여 파이프라인을 다시 생성하기 위해 자체적인 노이즈 제거 프로세스를 작성해 보겠습니다.

@@ -210,7 +210,7 @@ Stable Diffusion 은 text-to-image *latent diffusion* 모델입니다. latent di

 ```py
 >>> latents = torch.randn(
-...     (batch_size, unet.config.in_channels, height // 8, width // 8),
+...     (batch_size, unet.in_channels, height // 8, width // 8),
 ...     generator=generator,
 ...     device=torch_device,
 ... )
@@ -1,12 +1,10 @@
-# Community Pipeline Examples
+# Community Examples

 > **For more information about community pipelines, please have a look at [this issue](https://github.com/huggingface/diffusers/issues/841).**

-**Community pipeline** examples consist of pipelines that have been added by the community.
-Please have a look at the following tables to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
-If a community pipeline doesn't work as expected, please open an issue and ping the author on it.
-
-Please also check out our [Community Scripts](https://github.com/huggingface/diffusers/blob/main/examples/community/README_community_scripts.md) examples for tips and tricks that you can use with diffusers without having to run a community pipeline.
+**Community** examples consist of both inference and training examples that have been added by the community.
+Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste ready code example that you can try out.
+If a community example doesn't work as expected, please open an issue and ping the author on it.

 | Example                                                                                                                               | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              | Code Example                                                                              | Colab                                                                                                                                                                                                              |                                                        Author |
 |:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
@@ -1889,7 +1887,7 @@ In the above code, the `prompt2` is appended to the `prompt`, which is more than

 For more results, checkout [PR #6114](https://github.com/huggingface/diffusers/pull/6114).

-### Example Images Mixing (with CoCa)
+## Example Images Mixing (with CoCa)
 ```python
 import requests
 from io import BytesIO
@@ -2936,7 +2934,7 @@ pipe(prompt =prompt, rp_args = rp_args)

 The Pipeline supports `compel` syntax. Input prompts using the `compel` structure will be automatically applied and processed.

-### Diffusion Posterior Sampling Pipeline
+## Diffusion Posterior Sampling Pipeline
 * Reference paper
    ```
    @article{chung2022diffusion,
@@ -1,232 +0,0 @@
-# Community Scripts
-
-**Community scripts** consist of inference examples using Diffusers pipelines that have been added by the community. 
-Please have a look at the following table to get an overview of all community examples. Click on the **Code Example** to get a copy-and-paste code example that you can try out.
-If a community script doesn't work as expected, please open an issue and ping the author on it.
-
-| Example                                                                                                                               | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              | Code Example                                                                              | Colab                                                                                                                                                                                                              |                                                        Author |
-|:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
-| Using IP-Adapter with negative noise                                                                                                  | Using negative noise with IP-adapter to better control the generation (see the [original post](https://github.com/huggingface/diffusers/discussions/7167) on the forum for more details)                                                                                                                                                                                                                                                    | [IP-Adapter Negative Noise](#ip-adapter-negative-noise)                                   | | [Álvaro Somoza](https://github.com/asomoza)|
-| asymmetric tiling                                                                                                  |configure seamless image tiling independently for the X and Y axes                                                                                                                                                                                                      | [Asymmetric Tiling](#asymmetric-tiling )                                   | | [alexisrolland](https://github.com/alexisrolland)|
-
-
-## Example usages
-
-### IP Adapter Negative Noise
-
-Diffusers pipelines are fully integrated with IP-Adapter, which allows you to prompt the diffusion model with an image. However, it does not support negative image prompts (there is no `negative_ip_adapter_image` argument) the same way it supports negative text prompts. When you pass an `ip_adapter_image`, it will create a zero-filled tensor as a negative image. This script shows you how to create a negative noise from `ip_adapter_image` and use it to significantly improve the generation quality while preserving the composition of images.
-
-[cubiq](https://github.com/cubiq) initially developed this feature in his [repository](https://github.com/cubiq/ComfyUI_IPAdapter_plus). The community script was contributed by [asomoza](https://github.com/Somoza). You can find more details about this experiment in [this discussion](https://github.com/huggingface/diffusers/discussions/7167).
-
-IP-Adapter without negative noise
-|source|result|
-|---|---|
-|![20240229150812](https://github.com/huggingface/diffusers/assets/5442875/901d8bd8-7a59-4fe7-bda1-a0e0d6c7dffd)|![20240229163923_normal](https://github.com/huggingface/diffusers/assets/5442875/3432e25a-ece6-45f4-a3f4-fca354f40b5b)|
-
-IP-Adapter with negative noise
-|source|result|
-|---|---|
-|![20240229150812](https://github.com/huggingface/diffusers/assets/5442875/901d8bd8-7a59-4fe7-bda1-a0e0d6c7dffd)|![20240229163923](https://github.com/huggingface/diffusers/assets/5442875/736fd15a-36ba-40c0-a7d8-6ec1ac26f788)|
-
-```python
-import torch
-
-from diffusers import AutoencoderKL, DPMSolverMultistepScheduler, StableDiffusionXLPipeline
-from diffusers.models import ImageProjection
-from diffusers.utils import load_image
-
-
-def encode_image(
-    image_encoder,
-    feature_extractor,
-    image,
-    device,
-    num_images_per_prompt,
-    output_hidden_states=None,
-    negative_image=None,
-):
-    dtype = next(image_encoder.parameters()).dtype
-
-    if not isinstance(image, torch.Tensor):
-        image = feature_extractor(image, return_tensors="pt").pixel_values
-
-    image = image.to(device=device, dtype=dtype)
-    if output_hidden_states:
-        image_enc_hidden_states = image_encoder(image, output_hidden_states=True).hidden_states[-2]
-        image_enc_hidden_states = image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-
-        if negative_image is None:
-            uncond_image_enc_hidden_states = image_encoder(
-                torch.zeros_like(image), output_hidden_states=True
-            ).hidden_states[-2]
-        else:
-            if not isinstance(negative_image, torch.Tensor):
-                negative_image = feature_extractor(negative_image, return_tensors="pt").pixel_values
-            negative_image = negative_image.to(device=device, dtype=dtype)
-            uncond_image_enc_hidden_states = image_encoder(negative_image, output_hidden_states=True).hidden_states[-2]
-
-        uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0)
-        return image_enc_hidden_states, uncond_image_enc_hidden_states
-    else:
-        image_embeds = image_encoder(image).image_embeds
-        image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
-        uncond_image_embeds = torch.zeros_like(image_embeds)
-
-        return image_embeds, uncond_image_embeds
-
-
-@torch.no_grad()
-def prepare_ip_adapter_image_embeds(
-    unet,
-    image_encoder,
-    feature_extractor,
-    ip_adapter_image,
-    do_classifier_free_guidance,
-    device,
-    num_images_per_prompt,
-    ip_adapter_negative_image=None,
-):
-    if not isinstance(ip_adapter_image, list):
-        ip_adapter_image = [ip_adapter_image]
-
-    if len(ip_adapter_image) != len(unet.encoder_hid_proj.image_projection_layers):
-        raise ValueError(
-            f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(unet.encoder_hid_proj.image_projection_layers)} IP Adapters."
-        )
-
-    image_embeds = []
-    for single_ip_adapter_image, image_proj_layer in zip(
-        ip_adapter_image, unet.encoder_hid_proj.image_projection_layers
-    ):
-        output_hidden_state = not isinstance(image_proj_layer, ImageProjection)
-        single_image_embeds, single_negative_image_embeds = encode_image(
-            image_encoder,
-            feature_extractor,
-            single_ip_adapter_image,
-            device,
-            1,
-            output_hidden_state,
-            negative_image=ip_adapter_negative_image,
-        )
-        single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0)
-        single_negative_image_embeds = torch.stack([single_negative_image_embeds] * num_images_per_prompt, dim=0)
-
-        if do_classifier_free_guidance:
-            single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds])
-            single_image_embeds = single_image_embeds.to(device)
-
-        image_embeds.append(single_image_embeds)
-
-    return image_embeds
-
-
-vae = AutoencoderKL.from_pretrained(
-    "madebyollin/sdxl-vae-fp16-fix",
-    torch_dtype=torch.float16,
-).to("cuda")
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "RunDiffusion/Juggernaut-XL-v9",
-    torch_dtype=torch.float16,
-    vae=vae,
-    variant="fp16",
-).to("cuda")
-
-pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
-pipeline.scheduler.config.use_karras_sigmas = True
-
-pipeline.load_ip_adapter(
-    "h94/IP-Adapter",
-    subfolder="sdxl_models",
-    weight_name="ip-adapter-plus_sdxl_vit-h.safetensors",
-    image_encoder_folder="models/image_encoder",
-)
-pipeline.set_ip_adapter_scale(0.7)
-
-ip_image = load_image("source.png")
-negative_ip_image = load_image("noise.png")
-
-image_embeds = prepare_ip_adapter_image_embeds(
-    unet=pipeline.unet,
-    image_encoder=pipeline.image_encoder,
-    feature_extractor=pipeline.feature_extractor,
-    ip_adapter_image=[[ip_image]],
-    do_classifier_free_guidance=True,
-    device="cuda",
-    num_images_per_prompt=1,
-    ip_adapter_negative_image=negative_ip_image,
-)
-
-
-prompt = "cinematic photo of a cyborg in the city, 4k, high quality, intricate, highly detailed"
-negative_prompt = "blurry, smooth, plastic"
-
-image = pipeline(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    ip_adapter_image_embeds=image_embeds,
-    guidance_scale=6.0,
-    num_inference_steps=25,
-    generator=torch.Generator(device="cpu").manual_seed(1556265306),
-).images[0]
-
-image.save("result.png")
-```
-
-### Asymmetric Tiling
-Stable Diffusion is not trained to generate seamless textures. However, you can use this simple script to add tiling to your generation. This script is contributed by [alexisrolland](https://github.com/alexisrolland). See more details in [this issue](https://github.com/huggingface/diffusers/issues/556).
-
-
-|Generated|Tiled|
-|---|---|
-|![20240313003235_573631814](https://github.com/huggingface/diffusers/assets/5442875/eca174fb-06a4-464e-a3a7-00dbb024543e)|![wall](https://github.com/huggingface/diffusers/assets/5442875/b4aa774b-2a6a-4316-a8eb-8f30b5f4d024)|
-
-
-```py
-import torch
-from typing import Optional
-from diffusers import StableDiffusionPipeline
-from diffusers.models.lora import LoRACompatibleConv
-
-def seamless_tiling(pipeline, x_axis, y_axis):
-    def asymmetric_conv2d_convforward(self, input: torch.Tensor, weight: torch.Tensor, bias: Optional[torch.Tensor] = None):
-        self.paddingX = (self._reversed_padding_repeated_twice[0], self._reversed_padding_repeated_twice[1], 0, 0)
-        self.paddingY = (0, 0, self._reversed_padding_repeated_twice[2], self._reversed_padding_repeated_twice[3])
-        working = torch.nn.functional.pad(input, self.paddingX, mode=x_mode)
-        working = torch.nn.functional.pad(working, self.paddingY, mode=y_mode)
-        return torch.nn.functional.conv2d(working, weight, bias, self.stride, torch.nn.modules.utils._pair(0), self.dilation, self.groups)
-    x_mode = 'circular' if x_axis else 'constant'
-    y_mode = 'circular' if y_axis else 'constant'
-    targets = [pipeline.vae, pipeline.text_encoder, pipeline.unet]
-    convolution_layers = []
-    for target in targets:
-        for module in target.modules():
-            if isinstance(module, torch.nn.Conv2d):
-                convolution_layers.append(module)
-    for layer in convolution_layers:
-        if isinstance(layer, LoRACompatibleConv) and layer.lora_layer is None:
-            layer.lora_layer = lambda * x: 0
-        layer._conv_forward = asymmetric_conv2d_convforward.__get__(layer, torch.nn.Conv2d)
-    return pipeline
-
-pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True)
-pipeline.enable_model_cpu_offload()
-prompt = ["texture of a red brick wall"]
-seed = 123456
-generator = torch.Generator(device='cuda').manual_seed(seed)
-
-pipeline = seamless_tiling(pipeline=pipeline, x_axis=True, y_axis=True)
-image = pipeline(
-    prompt=prompt,
-    width=512,
-    height=512,
-    num_inference_steps=20,
-    guidance_scale=7,
-    num_images_per_prompt=1,
-    generator=generator
-).images[0]
-seamless_tiling(pipeline=pipeline, x_axis=False, y_axis=False)
-
-torch.cuda.empty_cache()
-image.save('image.png')
-```
@@ -1,8 +1,7 @@
 """
-modeled after the textual_inversion.py / train_dreambooth.py and the work
-of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
+    modeled after the textual_inversion.py / train_dreambooth.py and the work
+    of justinpinkney here: https://github.com/justinpinkney/stable-diffusion/blob/main/notebooks/imagic.ipynb
 """
-
 import inspect
 import warnings
 from typing import List, Optional, Union
@@ -440,7 +440,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -348,7 +348,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -206,7 +206,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool
            dimensions: ``batch x channels x height x width``.
    """

-    # checkpoint. #TODO(Yiyi) - need to clean this up later
+    # checkpoint. TOD(Yiyi) - need to clean this up later
    if image is None:
        raise ValueError("`image` input cannot be undefined.")

@@ -277,7 +277,7 @@ def prepare_mask_and_masked_image(image, mask, height, width, return_image: bool
        # images are in latent space and thus can't
        # be masked set masked_image to None
        # we assume that the checkpoint is not an inpainting
-        # checkpoint. #TODO(Yiyi) - need to clean this up later
+        # checkpoint. TOD(Yiyi) - need to clean this up later
        masked_image = None
    else:
        masked_image = image * (mask < 0.5)
@@ -81,7 +81,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -1,7 +1,6 @@
 """
-modified based on diffusion library from Huggingface: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+    modified based on diffusion library from Huggingface: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
 """
-
 import inspect
 from typing import Callable, List, Optional, Union

@@ -224,7 +224,7 @@ class StableDiffusionIPEXPipeline(
        # 5. Prepare latent variables
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
-            self.unet.config.in_channels,
+            self.unet.in_channels,
            height,
            width,
            prompt_embeds.dtype,
@@ -679,7 +679,7 @@ class StableDiffusionIPEXPipeline(
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
-        num_channels_latents = self.unet.config.in_channels
+        num_channels_latents = self.unet.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
@@ -917,7 +917,7 @@ class TensorRTStableDiffusionPipeline(StableDiffusionPipeline):
            text_embeddings = self.__encode_prompt(prompt, negative_prompt)

            # Pre-initialize latents
-            num_channels_latents = self.unet.config.in_channels
+            num_channels_latents = self.unet.in_channels
            latents = self.prepare_latents(
                batch_size,
                num_channels_latents,
@@ -35,6 +35,7 @@ def slerp(val, low, high):


 class UnCLIPTextInterpolationPipeline(DiffusionPipeline):
+
    """
    Pipeline for prompt-to-prompt interpolation on CLIP text embeddings and using the UnCLIP / Dall-E to decode them to images.

@@ -48,7 +49,7 @@ class UnCLIPTextInterpolationPipeline(DiffusionPipeline):
            Tokenizer of class
            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
        prior ([`PriorTransformer`]):
-            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+            The canonincal unCLIP prior to approximate the image embedding from the text embedding.
        text_proj ([`UnCLIPTextProjModel`]):
            Utility class to prepare and combine the embeddings before they are passed to the decoder.
        decoder ([`UNet2DConditionModel`]):
@@ -23,7 +23,6 @@ TODO:
 6. Integrate to training x
 7. Test
 """
-
 import copy
 import random

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for stable diffusion checkpoints which _only_ contain a controlnet."""
+""" Conversion script for stable diffusion checkpoints which _only_ contain a controlnet. """

 import argparse
 import re
@@ -911,7 +911,6 @@ def main(args):
        )
        precomputed_dataset = precomputed_dataset.with_transform(preprocess_train)

-    del compute_vae_encodings_fn, compute_embeddings_fn, text_encoder_one, text_encoder_two
    del text_encoders, tokenizers, vae
    gc.collect()
    torch.cuda.empty_cache()
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the LDM checkpoints."""
+""" Conversion script for the LDM checkpoints. """

 import argparse
 import json
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the LDM checkpoints."""
+""" Conversion script for the LDM checkpoints. """

 import argparse

@@ -1195,9 +1195,9 @@ def superres_check_against_original(dump_path, unet_checkpoint_path):
        if_II_model = IFStageIII(device="cuda", dir_or_name=orig_path, model_kwargs={"precision": "fp32"}).model

    batch_size = 1
-    channels = model.config.in_channels // 2
-    height = model.config.sample_size
-    width = model.config.sample_size
+    channels = model.in_channels // 2
+    height = model.sample_size
+    width = model.sample_size
    height = 1024
    width = 1024

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the LDM checkpoints."""
+""" Conversion script for the LDM checkpoints. """

 import argparse
 import json
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-"""Conversion script for the LoRA's safetensors checkpoints."""
+""" Conversion script for the LoRA's safetensors checkpoints. """

 import argparse

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the LDM checkpoints."""
+""" Conversion script for the LDM checkpoints. """

 import argparse

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the NCSNPP checkpoints."""
+""" Conversion script for the NCSNPP checkpoints. """

 import argparse
 import json
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the AudioLDM2 checkpoints."""
+""" Conversion script for the AudioLDM2 checkpoints."""

 import argparse
 import re
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the AudioLDM checkpoints."""
+""" Conversion script for the AudioLDM checkpoints."""

 import argparse
 import re
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for stable diffusion checkpoints which _only_ contain a controlnet."""
+""" Conversion script for stable diffusion checkpoints which _only_ contain a controlnet. """

 import argparse

@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the MusicLDM checkpoints."""
+""" Conversion script for the MusicLDM checkpoints."""

 import argparse
 import re
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the LDM checkpoints."""
+""" Conversion script for the LDM checkpoints. """

 import argparse
 import importlib
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the Versatile Stable Diffusion checkpoints."""
+""" Conversion script for the Versatile Stable Diffusion checkpoints. """

 import argparse
 from argparse import Namespace
@@ -11,7 +11,6 @@ $ python convert_zero123_to_diffusers.py \
   --original_config_file /path/zero123/configs/sd-objaverse-finetune-c_concat-256.yaml
 ```
 """
-
 import argparse

 import torch
@@ -13,8 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""ConfigMixin base class and utilities."""
-
+""" ConfigMixin base class and utilities."""
 import dataclasses
 import functools
 import importlib
@@ -19,7 +19,7 @@ import torch
 from huggingface_hub.utils import validate_hf_hub_args
 from safetensors import safe_open

-from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_state_dict
+from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT
 from ..utils import (
    _get_model_file,
    is_accelerate_available,
@@ -182,7 +182,7 @@ class IPAdapterMixin:
                            elif key.startswith("ip_adapter."):
                                state_dict["ip_adapter"][key.replace("ip_adapter.", "")] = f.get_tensor(key)
                else:
-                    state_dict = load_state_dict(model_file)
+                    state_dict = torch.load(model_file, map_location="cpu")
            else:
                state_dict = pretrained_model_name_or_path_or_dict

@@ -25,7 +25,7 @@ from packaging import version
 from torch import nn

 from .. import __version__
-from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_state_dict
+from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT
 from ..utils import (
    USE_PEFT_BACKEND,
    _get_model_file,
@@ -281,7 +281,7 @@ class LoraLoaderMixin:
                    subfolder=subfolder,
                    user_agent=user_agent,
                )
-                state_dict = load_state_dict(model_file)
+                state_dict = torch.load(model_file, map_location="cpu")
        else:
            state_dict = pretrained_model_name_or_path_or_dict

@@ -198,13 +198,8 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_
                unet_state_dict[diffusers_name] = state_dict.pop(key)
                unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)

-        elif lora_name.startswith(("lora_te_", "lora_te1_", "lora_te2_")):
-            if lora_name.startswith(("lora_te_", "lora_te1_")):
-                key_to_replace = "lora_te_" if lora_name.startswith("lora_te_") else "lora_te1_"
-            else:
-                key_to_replace = "lora_te2_"
-
-            diffusers_name = key.replace(key_to_replace, "").replace("_", ".")
+        elif lora_name.startswith("lora_te_"):
+            diffusers_name = key.replace("lora_te_", "").replace("_", ".")
            diffusers_name = diffusers_name.replace("text.model", "text_model")
            diffusers_name = diffusers_name.replace("self.attn", "self_attn")
            diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
@@ -212,22 +207,52 @@ def _convert_kohya_lora_to_diffusers(state_dict, unet_name="unet", text_encoder_
            diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
            diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
            if "self_attn" in diffusers_name:
-                if lora_name.startswith(("lora_te_", "lora_te1_")):
-                    te_state_dict[diffusers_name] = state_dict.pop(key)
-                    te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-                else:
-                    te2_state_dict[diffusers_name] = state_dict.pop(key)
-                    te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+                te_state_dict[diffusers_name] = state_dict.pop(key)
+                te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
            elif "mlp" in diffusers_name:
                # Be aware that this is the new diffusers convention and the rest of the code might
                # not utilize it yet.
                diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
-                if lora_name.startswith(("lora_te_", "lora_te1_")):
-                    te_state_dict[diffusers_name] = state_dict.pop(key)
-                    te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
-                else:
-                    te2_state_dict[diffusers_name] = state_dict.pop(key)
-                    te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+                te_state_dict[diffusers_name] = state_dict.pop(key)
+                te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+
+        # (sayakpaul): Duplicate code. Needs to be cleaned.
+        elif lora_name.startswith("lora_te1_"):
+            diffusers_name = key.replace("lora_te1_", "").replace("_", ".")
+            diffusers_name = diffusers_name.replace("text.model", "text_model")
+            diffusers_name = diffusers_name.replace("self.attn", "self_attn")
+            diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
+            diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
+            diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
+            diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
+            if "self_attn" in diffusers_name:
+                te_state_dict[diffusers_name] = state_dict.pop(key)
+                te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+            elif "mlp" in diffusers_name:
+                # Be aware that this is the new diffusers convention and the rest of the code might
+                # not utilize it yet.
+                diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
+                te_state_dict[diffusers_name] = state_dict.pop(key)
+                te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+
+        # (sayakpaul): Duplicate code. Needs to be cleaned.
+        elif lora_name.startswith("lora_te2_"):
+            diffusers_name = key.replace("lora_te2_", "").replace("_", ".")
+            diffusers_name = diffusers_name.replace("text.model", "text_model")
+            diffusers_name = diffusers_name.replace("self.attn", "self_attn")
+            diffusers_name = diffusers_name.replace("q.proj.lora", "to_q_lora")
+            diffusers_name = diffusers_name.replace("k.proj.lora", "to_k_lora")
+            diffusers_name = diffusers_name.replace("v.proj.lora", "to_v_lora")
+            diffusers_name = diffusers_name.replace("out.proj.lora", "to_out_lora")
+            if "self_attn" in diffusers_name:
+                te2_state_dict[diffusers_name] = state_dict.pop(key)
+                te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)
+            elif "mlp" in diffusers_name:
+                # Be aware that this is the new diffusers convention and the rest of the code might
+                # not utilize it yet.
+                diffusers_name = diffusers_name.replace(".lora.", ".lora_linear_layer.")
+                te2_state_dict[diffusers_name] = state_dict.pop(key)
+                te2_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict.pop(lora_name_up)

        # Rename the alphas so that they can be mapped appropriately.
        if lora_name_alpha in state_dict:
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the Stable Diffusion checkpoints."""
+""" Conversion script for the Stable Diffusion checkpoints."""

 import os
 import re
@@ -18,7 +18,6 @@ import torch
 from huggingface_hub.utils import validate_hf_hub_args
 from torch import nn

-from ..models.modeling_utils import load_state_dict
 from ..utils import _get_model_file, is_accelerate_available, is_transformers_available, logging


@@ -101,7 +100,7 @@ def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs)
                    subfolder=subfolder,
                    user_agent=user_agent,
                )
-                state_dict = load_state_dict(model_file)
+                state_dict = torch.load(model_file, map_location="cpu")
        else:
            state_dict = pretrained_model_name_or_path

@@ -31,7 +31,7 @@ from ..models.embeddings import (
    IPAdapterPlusImageProjection,
    MultiIPAdapterImageProjection,
 )
-from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta, load_state_dict
+from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, load_model_dict_into_meta
 from ..utils import (
    USE_PEFT_BACKEND,
    _get_model_file,
@@ -214,7 +214,7 @@ class UNet2DConditionLoadersMixin:
                    subfolder=subfolder,
                    user_agent=user_agent,
                )
-                state_dict = load_state_dict(model_file)
+                state_dict = torch.load(model_file, map_location="cpu")
        else:
            state_dict = pretrained_model_name_or_path_or_dict

@@ -424,7 +424,7 @@ class Attention(nn.Module):
        # If doesn't apply LoRA do `add_k_proj` or `add_v_proj`
        is_lora_activated.pop("add_k_proj", None)
        is_lora_activated.pop("add_v_proj", None)
-        # 2. else it is not possible that only some layers have LoRA activated
+        # 2. else it is not posssible that only some layers have LoRA activated
        if not all(is_lora_activated.values()):
            raise ValueError(
                f"Make sure that either all layers or no layers have LoRA activated, but have {is_lora_activated}"
@@ -767,7 +767,18 @@ class AttnProcessor:
        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
+            # encoder_hidden_states = hidden_states
+            batch, seq, dim = hidden_states.shape
+            height = width = seq**0.5
+            # reshape to (batch, height, width, dim)
+            encoder_hidden_states = hidden_states.view(batch, height, width, dim)
+            # reshape to (batch, dim, height, width)
+            encoder_hidden_states = encoder_hidden_states.permute(0, 3, 1, 2)
+            encoder_hidden_states = torch.nn.functional.avg_pool2d(hidden_states, kernel_size=4)
+            # reshape to (batch, dim, seq)
+            encoder_hidden_states = encoder_hidden_states.view(batch, dim, -1)
+            # reshape to (batch, seq, dim)
+            encoder_hidden_states = encoder_hidden_states.permute(0, 2, 1)
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

@@ -1259,7 +1270,18 @@ class AttnProcessor2_0:
        query = attn.to_q(hidden_states)

        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
+            # encoder_hidden_states = hidden_states
+            batch, seq, dim = hidden_states.shape
+            height = width = seq**0.5
+            # reshape to (batch, height, width, dim)
+            encoder_hidden_states = hidden_states.view(batch, height, width, dim)
+            # reshape to (batch, dim, height, width)
+            encoder_hidden_states = encoder_hidden_states.permute(0, 3, 1, 2)
+            encoder_hidden_states = torch.nn.functional.avg_pool2d(hidden_states, kernel_size=4)
+            # reshape to (batch, dim, seq)
+            encoder_hidden_states = encoder_hidden_states.view(batch, dim, -1)
+            # reshape to (batch, seq, dim)
+            encoder_hidden_states = encoder_hidden_states.permute(0, 2, 1)
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)

@@ -2098,7 +2120,7 @@ class LoRAAttnAddedKVProcessor(nn.Module):

 class IPAdapterAttnProcessor(nn.Module):
    r"""
-    Attention processor for Multiple IP-Adapters.
+    Attention processor for Multiple IP-Adapater.

    Args:
        hidden_size (`int`):
@@ -2152,8 +2174,8 @@ class IPAdapterAttnProcessor(nn.Module):
                encoder_hidden_states, ip_hidden_states = encoder_hidden_states
            else:
                deprecation_message = (
-                    "You have passed a tensor as `encoder_hidden_states`. This is deprecated and will be removed in a future release."
-                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to suppress this warning."
+                    "You have passed a tensor as `encoder_hidden_states`.This is deprecated and will be removed in a future release."
+                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to supress this warning."
                )
                deprecate("encoder_hidden_states not a tuple", "1.0.0", deprecation_message, standard_warn=False)
                end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0]
@@ -2253,7 +2275,7 @@ class IPAdapterAttnProcessor(nn.Module):

 class IPAdapterAttnProcessor2_0(torch.nn.Module):
    r"""
-    Attention processor for IP-Adapter for PyTorch 2.0.
+    Attention processor for IP-Adapater for PyTorch 2.0.

    Args:
        hidden_size (`int`):
@@ -2312,8 +2334,8 @@ class IPAdapterAttnProcessor2_0(torch.nn.Module):
                encoder_hidden_states, ip_hidden_states = encoder_hidden_states
            else:
                deprecation_message = (
-                    "You have passed a tensor as `encoder_hidden_states`. This is deprecated and will be removed in a future release."
-                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to suppress this warning."
+                    "You have passed a tensor as `encoder_hidden_states`.This is deprecated and will be removed in a future release."
+                    " Please make sure to update your script to pass `encoder_hidden_states` as a tuple to supress this warning."
                )
                deprecate("encoder_hidden_states not a tuple", "1.0.0", deprecation_message, standard_warn=False)
                end_pos = encoder_hidden_states.shape[1] - self.num_tokens[0]
@@ -281,7 +281,7 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlNetMixin):
        elif encoder_hid_dim_type == "text_image_proj":
            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
-            # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)`
+            # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)`
            self.encoder_hid_proj = TextImageProjection(
                text_embed_dim=encoder_hid_dim,
                image_embed_dim=cross_attention_dim,
@@ -330,7 +330,7 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlNetMixin):
        elif addition_embed_type == "text_image":
            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
-            # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)`
+            # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)`
            self.add_embedding = TextImageTimeEmbedding(
                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
            )
@@ -509,9 +509,6 @@ class ControlNetModel(ModelMixin, ConfigMixin, FromOriginalControlNetMixin):
            if controlnet.class_embedding:
                controlnet.class_embedding.load_state_dict(unet.class_embedding.state_dict())

-            if hasattr(controlnet, "add_embedding"):
-                controlnet.add_embedding.load_state_dict(unet.add_embedding.state_dict())
-
            controlnet.down_blocks.load_state_dict(unet.down_blocks.state_dict())
            controlnet.mid_block.load_state_dict(unet.mid_block.state_dict())

@@ -12,8 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""PyTorch - Flax general utilities."""
-
+""" PyTorch - Flax general utilities."""
 import re

 import jax.numpy as jnp
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""PyTorch - Flax general utilities."""
+""" PyTorch - Flax general utilities."""

 from pickle import UnpicklingError

@@ -20,7 +20,6 @@ import os
 import re
 from collections import OrderedDict
 from functools import partial
-from pathlib import Path
 from typing import Any, Callable, List, Optional, Tuple, Union

 import safetensors
@@ -108,12 +107,7 @@ def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[
        if file_extension == SAFETENSORS_FILE_EXTENSION:
            return safetensors.torch.load_file(checkpoint_file, device="cpu")
        else:
-            weights_only_kwarg = {"weights_only": True} if is_torch_version(">=", "1.13") else {}
-            return torch.load(
-                checkpoint_file,
-                map_location="cpu",
-                **weights_only_kwarg,
-            )
+            return torch.load(checkpoint_file, map_location="cpu")
    except Exception as e:
        try:
            with open(checkpoint_file) as f:
@@ -373,18 +367,18 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
        # Save the model
        if safe_serialization:
            safetensors.torch.save_file(
-                state_dict, Path(save_directory, weights_name).as_posix(), metadata={"format": "pt"}
+                state_dict, os.path.join(save_directory, weights_name), metadata={"format": "pt"}
            )
        else:
-            torch.save(state_dict, Path(save_directory, weights_name).as_posix())
+            torch.save(state_dict, os.path.join(save_directory, weights_name))

-        logger.info(f"Model weights saved in {Path(save_directory, weights_name).as_posix()}")
+        logger.info(f"Model weights saved in {os.path.join(save_directory, weights_name)}")

        if push_to_hub:
            # Create a new empty model card and eventually tag it
            model_card = load_or_create_model_card(repo_id, token=token)
            model_card = populate_model_card(model_card)
-            model_card.save(Path(save_directory, "README.md").as_posix())
+            model_card.save(os.path.join(save_directory, "README.md"))

            self._upload_folder(
                save_directory,
@@ -20,15 +20,15 @@ from .transformers.transformer_temporal import (


 class TransformerTemporalModelOutput(TransformerTemporalModelOutput):
-    deprecation_message = "Importing `TransformerTemporalModelOutput` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_temporal import TransformerTemporalModelOutput`, instead."
+    deprecation_message = "Importing `TransformerTemporalModelOutput` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.tranformer_temporal import TransformerTemporalModelOutput`, instead."
    deprecate("TransformerTemporalModelOutput", "0.29", deprecation_message)


 class TransformerTemporalModel(TransformerTemporalModel):
-    deprecation_message = "Importing `TransformerTemporalModel` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_temporal import TransformerTemporalModel`, instead."
+    deprecation_message = "Importing `TransformerTemporalModel` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.tranformer_temporal import TransformerTemporalModel`, instead."
    deprecate("TransformerTemporalModel", "0.29", deprecation_message)


 class TransformerSpatioTemporalModel(TransformerSpatioTemporalModel):
-    deprecation_message = "Importing `TransformerSpatioTemporalModel` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.transformer_temporal import TransformerSpatioTemporalModel`, instead."
+    deprecation_message = "Importing `TransformerSpatioTemporalModel` from `diffusers.models.transformer_temporal` is deprecated and this will be removed in a future version. Please use `from diffusers.models.transformers.tranformer_temporal import TransformerSpatioTemporalModel`, instead."
    deprecate("TransformerTemporalModelOutput", "0.29", deprecation_message)
@@ -129,7 +129,7 @@ class Transformer2DModel(ModelMixin, ConfigMixin):
        if norm_type == "layer_norm" and num_embeds_ada_norm is not None:
            deprecation_message = (
                f"The configuration file of this model: {self.__class__} is outdated. `norm_type` is either not set or"
-                " incorrectly set to `'layer_norm'`. Make sure to set `norm_type` to `'ada_norm'` in the config."
+                " incorrectly set to `'layer_norm'`.Make sure to set `norm_type` to `'ada_norm'` in the config."
                " Please make sure to update the config accordingly as leaving `norm_type` might led to incorrect"
                " results in future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it"
                " would be very nice if you could open a Pull request for the `transformer/config.json` file"
@@ -580,7 +580,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
        elif encoder_hid_dim_type == "text_image_proj":
            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
-            # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)`
+            # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)`
            self.encoder_hid_proj = TextImageProjection(
                text_embed_dim=encoder_hid_dim,
                image_embed_dim=cross_attention_dim,
@@ -660,7 +660,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
        elif addition_embed_type == "text_image":
            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
-            # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)`
+            # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)`
            self.add_embedding = TextImageTimeEmbedding(
                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
            )
@@ -1010,7 +1010,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
-            # Kandinsky 2.1 - style
+            # Kandinsky 2.1 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
@@ -1081,8 +1081,6 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
                A tuple of tensors that if specified are added to the residuals of down unet blocks.
            mid_block_additional_residual: (`torch.Tensor`, *optional*):
                A tensor that if specified is added to the residual of the middle unet block.
-            down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
-                additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
            encoder_attention_mask (`torch.Tensor`):
                A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
                `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
@@ -1090,6 +1088,18 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
                tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
+            added_cond_kwargs: (`dict`, *optional*):
+                A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+                are passed along to the UNet blocks.
+            down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added to UNet long skip connections from down blocks to up blocks for
+                example from ControlNet side model(s)
+            mid_block_additional_residual (`torch.Tensor`, *optional*):
+                additional residual to be added to UNet mid block output, for example from ControlNet side model
+            down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+                additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)

        Returns:
            [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
@@ -1175,14 +1185,7 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin,
            cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)}

        # 3. down
-        # we're popping the `scale` instead of getting it because otherwise `scale` will be propagated
-        # to the internal blocks and will raise deprecation warnings. this will be confusing for our users.
-        if cross_attention_kwargs is not None:
-            cross_attention_kwargs = cross_attention_kwargs.copy()
-            lora_scale = cross_attention_kwargs.pop("scale", 1.0)
-        else:
-            lora_scale = 1.0
-
+        lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0
        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
@@ -91,8 +91,6 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
        cross_attention_dim (`int`, *optional*, defaults to 1024): The dimension of the cross attention features.
        attention_head_dim (`int`, *optional*, defaults to 64): The dimension of the attention heads.
        num_attention_heads (`int`, *optional*): The number of attention heads.
-        time_cond_proj_dim (`int`, *optional*, defaults to `None`):
-            The dimension of `cond_proj` layer in the timestep embedding.
    """

    _supports_gradient_checkpointing = False
@@ -125,7 +123,6 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
        cross_attention_dim: int = 1024,
        attention_head_dim: Union[int, Tuple[int]] = 64,
        num_attention_heads: Optional[Union[int, Tuple[int]]] = None,
-        time_cond_proj_dim: Optional[int] = None,
    ):
        super().__init__()

@@ -177,7 +174,6 @@ class UNet3DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
            timestep_input_dim,
            time_embed_dim,
            act_fn=act_fn,
-            cond_proj_dim=time_cond_proj_dim,
        )

        self.transformer_in = TransformerTemporalModel(
@@ -521,11 +521,9 @@ class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalUNetMixin):
                        if isinstance(block, SDCascadeResBlock):
                            skip = level_outputs[i] if k == 0 and i > 0 else None
                            if skip is not None and (x.size(-1) != skip.size(-1) or x.size(-2) != skip.size(-2)):
-                                orig_type = x.dtype
                                x = torch.nn.functional.interpolate(
                                    x.float(), skip.shape[-2:], mode="bilinear", align_corners=True
                                )
-                                x = x.to(orig_type)
                            x = torch.utils.checkpoint.checkpoint(
                                create_custom_forward(block), x, skip, use_reentrant=False
                            )
@@ -549,11 +547,9 @@ class StableCascadeUNet(ModelMixin, ConfigMixin, FromOriginalUNetMixin):
                        if isinstance(block, SDCascadeResBlock):
                            skip = level_outputs[i] if k == 0 and i > 0 else None
                            if skip is not None and (x.size(-1) != skip.size(-1) or x.size(-2) != skip.size(-2)):
-                                orig_type = x.dtype
                                x = torch.nn.functional.interpolate(
                                    x.float(), skip.shape[-2:], mode="bilinear", align_corners=True
                                )
-                                x = x.to(orig_type)
                            x = block(x, skip)
                        elif isinstance(block, SDCascadeAttnBlock):
                            x = block(x, clip)
@@ -46,7 +46,6 @@ from .kandinsky2_2 import (
 from .kandinsky3 import Kandinsky3Img2ImgPipeline, Kandinsky3Pipeline
 from .latent_consistency_models import LatentConsistencyModelImg2ImgPipeline, LatentConsistencyModelPipeline
 from .pixart_alpha import PixArtAlphaPipeline
-from .stable_cascade import StableCascadeCombinedPipeline, StableCascadeDecoderPipeline
 from .stable_diffusion import (
    StableDiffusionImg2ImgPipeline,
    StableDiffusionInpaintPipeline,
@@ -71,7 +70,6 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
        ("stable-diffusion-controlnet", StableDiffusionControlNetPipeline),
        ("stable-diffusion-xl-controlnet", StableDiffusionXLControlNetPipeline),
        ("wuerstchen", WuerstchenCombinedPipeline),
-        ("cascade", StableCascadeCombinedPipeline),
        ("lcm", LatentConsistencyModelPipeline),
        ("pixart", PixArtAlphaPipeline),
    ]
@@ -108,7 +106,6 @@ _AUTO_TEXT2IMAGE_DECODER_PIPELINES_MAPPING = OrderedDict(
        ("kandinsky", KandinskyPipeline),
        ("kandinsky22", KandinskyV22Pipeline),
        ("wuerstchen", WuerstchenDecoderPipeline),
-        ("cascade", StableCascadeDecoderPipeline),
    ]
 )
 _AUTO_IMAGE2IMAGE_DECODER_PIPELINES_MAPPING = OrderedDict(
@@ -1171,7 +1171,7 @@ class StableDiffusionControlNetInpaintPipeline(
                `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and
                contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
                the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
-                and contain information irrelevant for inpainting, such as background.
+                and contain information irrelevant for inpainting, such as background.
            strength (`float`, *optional*, defaults to 1.0):
                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
@@ -1198,7 +1198,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
                `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and
                contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
                the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
-                and contain information irrelevant for inpainting, such as background.
+                and contain information irrelevant for inpainting, such as background.
            strength (`float`, *optional*, defaults to 0.9999):
                Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
                between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
@@ -32,7 +32,6 @@ from diffusers.utils.import_utils import is_invisible_watermark_available

 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import (
-    FromSingleFileMixin,
    IPAdapterMixin,
    StableDiffusionXLLoraLoaderMixin,
    TextualInversionLoaderMixin,
@@ -162,7 +161,6 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
    StableDiffusionMixin,
    TextualInversionLoaderMixin,
    StableDiffusionXLLoraLoaderMixin,
-    FromSingleFileMixin,
    IPAdapterMixin,
 ):
    r"""
@@ -613,7 +613,7 @@ class IFImg2ImgPipeline(DiffusionPipeline, LoraLoaderMixin):

            for image_ in image:
                image_ = image_.convert("RGB")
-                image_ = resize(image_, self.unet.config.sample_size)
+                image_ = resize(image_, self.unet.sample_size)
                image_ = np.array(image_)
                image_ = image_.astype(np.float32)
                image_ = image_ / 127.5 - 1
@@ -662,7 +662,7 @@ class IFImg2ImgSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):

            for image_ in image:
                image_ = image_.convert("RGB")
-                image_ = resize(image_, self.unet.config.sample_size)
+                image_ = resize(image_, self.unet.sample_size)
                image_ = np.array(image_)
                image_ = image_.astype(np.float32)
                image_ = image_ / 127.5 - 1
@@ -654,7 +654,7 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin):

            for image_ in image:
                image_ = image_.convert("RGB")
-                image_ = resize(image_, self.unet.config.sample_size)
+                image_ = resize(image_, self.unet.sample_size)
                image_ = np.array(image_)
                image_ = image_.astype(np.float32)
                image_ = image_ / 127.5 - 1
@@ -701,7 +701,7 @@ class IFInpaintingPipeline(DiffusionPipeline, LoraLoaderMixin):

            for mask_image_ in mask_image:
                mask_image_ = mask_image_.convert("L")
-                mask_image_ = resize(mask_image_, self.unet.config.sample_size)
+                mask_image_ = resize(mask_image_, self.unet.sample_size)
                mask_image_ = np.array(mask_image_)
                mask_image_ = mask_image_[None, None, :]
                new_mask_image.append(mask_image_)
@@ -698,7 +698,7 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):

            for image_ in image:
                image_ = image_.convert("RGB")
-                image_ = resize(image_, self.unet.config.sample_size)
+                image_ = resize(image_, self.unet.sample_size)
                image_ = np.array(image_)
                image_ = image_.astype(np.float32)
                image_ = image_ / 127.5 - 1
@@ -778,7 +778,7 @@ class IFInpaintingSuperResolutionPipeline(DiffusionPipeline, LoraLoaderMixin):

            for mask_image_ in mask_image:
                mask_image_ = mask_image_.convert("L")
-                mask_image_ = resize(mask_image_, self.unet.config.sample_size)
+                mask_image_ = resize(mask_image_, self.unet.sample_size)
                mask_image_ = np.array(mask_image_)
                mask_image_ = mask_image_[None, None, :]
                new_mask_image.append(mask_image_)
@@ -528,12 +528,15 @@ class StableDiffusionInpaintPipelineLegacy(
                    f" {negative_prompt_embeds.shape}."
                )

+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
    def get_timesteps(self, num_inference_steps, strength, device):
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)

        return timesteps, num_inference_steps - t_start

@@ -531,7 +531,7 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
        elif encoder_hid_dim_type == "text_image_proj":
            # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much
            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
-            # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)`
+            # case when `addition_embed_type == "text_image_proj"` (Kandinsky 2.1)`
            self.encoder_hid_proj = TextImageProjection(
                text_embed_dim=encoder_hid_dim,
                image_embed_dim=cross_attention_dim,
@@ -591,7 +591,7 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
        elif addition_embed_type == "text_image":
            # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much
            # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use
-            # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)`
+            # case when `addition_embed_type == "text_image"` (Kandinsky 2.1)`
            self.add_embedding = TextImageTimeEmbedding(
                text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim
            )
@@ -1257,7 +1257,7 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
        if self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_proj":
            encoder_hidden_states = self.encoder_hid_proj(encoder_hidden_states)
        elif self.encoder_hid_proj is not None and self.config.encoder_hid_dim_type == "text_image_proj":
-            # Kandinsky 2.1 - style
+            # Kandinsky 2.1 - style
            if "image_embeds" not in added_cond_kwargs:
                raise ValueError(
                    f"{self.__class__} has the config param `encoder_hid_dim_type` set to 'text_image_proj' which requires the keyword argument `image_embeds` to be passed in  `added_conditions`"
@@ -129,7 +129,7 @@ class KandinskyCombinedPipeline(DiffusionPipeline):
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
-            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -346,7 +346,7 @@ class KandinskyImg2ImgCombinedPipeline(DiffusionPipeline):
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
-            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -586,7 +586,7 @@ class KandinskyInpaintCombinedPipeline(DiffusionPipeline):
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
-            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -134,7 +134,7 @@ class KandinskyPriorPipeline(DiffusionPipeline):

    Args:
        prior ([`PriorTransformer`]):
-            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        text_encoder ([`CLIPTextModelWithProjection`]):
@@ -119,7 +119,7 @@ class KandinskyV22CombinedPipeline(DiffusionPipeline):
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
-            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -346,7 +346,7 @@ class KandinskyV22Img2ImgCombinedPipeline(DiffusionPipeline):
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
-            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -594,7 +594,7 @@ class KandinskyV22InpaintCombinedPipeline(DiffusionPipeline):
        movq ([`VQModel`]):
            MoVQ Decoder to generate the image from the latents.
        prior_prior ([`PriorTransformer`]):
-            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
@@ -90,7 +90,7 @@ class KandinskyV22PriorPipeline(DiffusionPipeline):

    Args:
        prior ([`PriorTransformer`]):
-            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        text_encoder ([`CLIPTextModelWithProjection`]):
@@ -108,7 +108,7 @@ class KandinskyV22PriorEmb2EmbPipeline(DiffusionPipeline):

    Args:
        prior ([`PriorTransformer`]):
-            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        image_encoder ([`CLIPVisionModelWithProjection`]):
            Frozen image-encoder.
        text_encoder ([`CLIPTextModelWithProjection`]):
@@ -1382,6 +1382,7 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):

            # Don't download index files of forbidden patterns either
            ignore_patterns = ignore_patterns + [f"{i}.index.*json" for i in ignore_patterns]
+
            re_ignore_pattern = [re.compile(fnmatch.translate(p)) for p in ignore_patterns]
            re_allow_pattern = [re.compile(fnmatch.translate(p)) for p in allow_patterns]

@@ -86,7 +86,7 @@ class ShapEImg2ImgPipeline(DiffusionPipeline):

    Args:
        prior ([`PriorTransformer`]):
-            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        image_encoder ([`~transformers.CLIPVisionModel`]):
            Frozen image-encoder.
        image_processor ([`~transformers.CLIPImageProcessor`]):
@@ -100,10 +100,8 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
        )
        self.register_to_config(latent_dim_scale=latent_dim_scale)

-    def prepare_latents(
-        self, batch_size, image_embeddings, num_images_per_prompt, dtype, device, generator, latents, scheduler
-    ):
-        _, channels, height, width = image_embeddings.shape
+    def prepare_latents(self, image_embeddings, num_images_per_prompt, dtype, device, generator, latents, scheduler):
+        batch_size, channels, height, width = image_embeddings.shape
        latents_shape = (
            batch_size * num_images_per_prompt,
            4,
@@ -385,19 +383,7 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):
        )
        if isinstance(image_embeddings, list):
            image_embeddings = torch.cat(image_embeddings, dim=0)
-
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        # Compute the effective number of images per prompt
-        # We must account for the fact that the image embeddings from the prior can be generated with num_images_per_prompt > 1
-        # This results in a case where a single prompt is associated with multiple image embeddings
-        # Divide the number of image embeddings by the batch size to determine if this is the case.
-        num_images_per_prompt = num_images_per_prompt * (image_embeddings.shape[0] // batch_size)
+        batch_size = image_embeddings.shape[0]

        # 2. Encode caption
        if prompt_embeds is None and negative_prompt_embeds is None:
@@ -431,7 +417,7 @@ class StableCascadeDecoderPipeline(DiffusionPipeline):

        # 5. Prepare latents
        latents = self.prepare_latents(
-            batch_size, image_embeddings, num_images_per_prompt, dtype, device, generator, latents, self.scheduler
+            image_embeddings, num_images_per_prompt, dtype, device, generator, latents, self.scheduler
        )

        # 6. Run denoising loop
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Conversion script for the Stable Diffusion checkpoints."""
+"""Conversion script for the Stable Diffusion checkpoints."""

 import re
 from contextlib import nullcontext
@@ -469,7 +469,7 @@ class OnnxStableDiffusionUpscalePipeline(DiffusionPipeline):

        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
-            self.config.num_latent_channels,
+            self.num_latent_channels,
            height,
            width,
            latents_dtype,
@@ -498,12 +498,12 @@ class OnnxStableDiffusionUpscalePipeline(DiffusionPipeline):

        # 7. Check that sizes of image and latents match
        num_channels_image = image.shape[1]
-        if self.config.num_latent_channels + num_channels_image != self.config.num_unet_input_channels:
+        if self.num_latent_channels + num_channels_image != self.num_unet_input_channels:
            raise ValueError(
                "Incorrect configuration settings! The config of `pipeline.unet` expects"
-                f" {self.config.num_unet_input_channels} but received `num_channels_latents`: {self.config.num_latent_channels} +"
+                f" {self.num_unet_input_channels} but received `num_channels_latents`: {self.num_latent_channels} +"
                f" `num_channels_image`: {num_channels_image} "
-                f" = {self.config.num_latent_channels + num_channels_image}. Please verify the config of"
+                f" = {self.num_latent_channels + num_channels_image}. Please verify the config of"
                " `pipeline.unet` or your `image` input."
            )

@@ -700,8 +700,8 @@ class StableDiffusionDepth2ImgPipeline(DiffusionPipeline, TextualInversionLoader
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> init_image = Image.open(requests.get(url, stream=True).raw)
        >>> prompt = "two tigers"
-        >>> n_prompt = "bad, deformed, ugly, bad anotomy"
-        >>> image = pipe(prompt=prompt, image=init_image, negative_prompt=n_prompt, strength=0.7).images[0]
+        >>> n_prompt = "bad, deformed, ugly, bad anatomy"
+        >>> image = pipe(prompt=prompt, image=init_image, negative_prompt=n_prompt, strength=0.7).images[0]
        ```

        Returns:
@@ -1026,7 +1026,7 @@ class StableDiffusionInpaintPipeline(
                `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and
                contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
                the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
-                and contain information irrelevant for inpainting, such as background.
+                and contain information irrelevant for inpainting, such as background.
            strength (`float`, *optional*, defaults to 1.0):
                Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a
                starting point and more noise is added the higher the `strength`. The number of denoising steps depends
@@ -194,7 +194,7 @@ class StableDiffusionInstructPix2PixPipeline(
                A higher guidance scale value encourages the model to generate images closely linked to the text
                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
            image_guidance_scale (`float`, *optional*, defaults to 1.5):
-                Push the generated image towards the initial `image`. Image guidance scale is enabled by setting
+                Push the generated image towards the initial `image`. Image guidance scale is enabled by setting
                `image_guidance_scale > 1`. Higher image guidance scale encourages generated images that are closely
                linked to the source `image`, usually at the expense of lower image quality. This pipeline requires a
                value of at least `1`.
@@ -76,7 +76,7 @@ class StableUnCLIPPipeline(DiffusionPipeline, StableDiffusionMixin, TextualInver
        prior_text_encoder ([`CLIPTextModelWithProjection`]):
            Frozen [`CLIPTextModelWithProjection`] text-encoder.
        prior ([`PriorTransformer`]):
-            The canonical unCLIP prior to approximate the image embedding from the text embedding.
+            The canonical unCLIP prior to approximate the image embedding from the text embedding.
        prior_scheduler ([`KarrasDiffusionSchedulers`]):
            Scheduler used in the prior denoising process.
        image_normalizer ([`StableUnCLIPImageNormalizer`]):
@@ -716,12 +716,15 @@ class StableDiffusionDiffEditPipeline(
                    f" `source_negative_prompt_embeds` {source_negative_prompt_embeds.shape}."
                )

+    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps
    def get_timesteps(self, num_inference_steps, strength, device):
        # get the original timestep using init_timestep
        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)

        t_start = max(num_inference_steps - init_timestep, 0)
        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)

        return timesteps, num_inference_steps - t_start

@@ -680,7 +680,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
-        num_channels_latents = self.unet.config.in_channels
+        num_channels_latents = self.unet.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
@@ -713,7 +713,7 @@ class StableDiffusionGLIGENPipeline(DiffusionPipeline, StableDiffusionMixin):
        boxes = torch.zeros(max_objs, 4, device=device, dtype=self.text_encoder.dtype)
        boxes[:n_objs] = torch.tensor(gligen_boxes)
        text_embeddings = torch.zeros(
-            max_objs, self.unet.config.cross_attention_dim, device=device, dtype=self.text_encoder.dtype
+            max_objs, self.unet.cross_attention_dim, device=device, dtype=self.text_encoder.dtype
        )
        text_embeddings[:n_objs] = _text_embeddings
        # Generate a mask for each object that is entity described by phrases
@@ -847,7 +847,7 @@ class StableDiffusionGLIGENTextImagePipeline(DiffusionPipeline, StableDiffusionM
        timesteps = self.scheduler.timesteps

        # 5. Prepare latent variables
-        num_channels_latents = self.unet.config.in_channels
+        num_channels_latents = self.unet.in_channels
        latents = self.prepare_latents(
            batch_size * num_images_per_prompt,
            num_channels_latents,
@@ -1259,7 +1259,7 @@ class StableDiffusionXLInpaintPipeline(
                `padding_mask_crop` is not `None`, it will first find a rectangular region with the same aspect ration of the image and
                contains all masked area, and then expand that area based on `padding_mask_crop`. The image and mask_image will then be cropped based on
                the expanded area before resizing to the original image size for inpainting. This is useful when the masked area is small while the image is large
-                and contain information irrelevant for inpainting, such as background.
+                and contain information irrelevant for inpainting, such as background.
            strength (`float`, *optional*, defaults to 0.9999):
                Conceptually, indicates how much to transform the masked portion of the reference `image`. Must be
                between 0 and 1. `image` will be used as a starting point, adding more noise to it the larger the
@@ -659,7 +659,7 @@ class StableDiffusionXLInstructPix2PixPipeline(
                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
                usually at the expense of lower image quality.
            image_guidance_scale (`float`, *optional*, defaults to 1.5):
-                Image guidance scale is to push the generated image towards the initial image `image`. Image guidance
+                Image guidance scale is to push the generated image towards the initial image `image`. Image guidance
                scale is enabled by setting `image_guidance_scale > 1`. Higher image guidance scale encourages to
                generate images that are closely linked to the source image `image`, usually at the expense of lower
                image quality. This pipeline requires a value of at least `1`.
@@ -45,7 +45,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -104,7 +104,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
    @property
    def step_index(self):
        """
-        The index counter for current timestep. It will increase 1 after each scheduler step.
+        The index counter for current timestep. It will increase 1 after each scheduler step.
        """
        return self._step_index

@@ -233,7 +233,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
        sigmas = self._convert_to_karras(ramp)
        timesteps = self.sigma_to_t(sigmas)

-        sigmas = np.concatenate([sigmas, [self.config.sigma_min]]).astype(np.float32)
+        sigmas = np.concatenate([sigmas, [self.sigma_min]]).astype(np.float32)
        self.sigmas = torch.from_numpy(sigmas).to(device=device)

        if str(device).startswith("mps"):
@@ -434,11 +434,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
        if self.begin_index is None:
            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
-        elif self.step_index is not None:
-            # add_noise is called after first denoising step (for inpainting)
-            step_indices = [self.step_index] * timesteps.shape[0]
        else:
-            # add noise is called before first denoising step to create initial latent(img2img)
            step_indices = [self.begin_index] * timesteps.shape[0]

        sigma = sigmas[step_indices].flatten()
@@ -82,7 +82,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -80,7 +80,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -82,7 +82,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -79,7 +79,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -81,7 +81,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -75,7 +75,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -61,7 +61,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -191,7 +191,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
    @property
    def step_index(self):
        """
-        The index counter for current timestep. It will increase 1 after each scheduler step.
+        The index counter for current timestep. It will increase 1 after each scheduler step.
        """
        return self._step_index

@@ -768,14 +768,10 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
            schedule_timesteps = self.timesteps.to(original_samples.device)
            timesteps = timesteps.to(original_samples.device)

-        # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
+        # begin_index is None when the scheduler is used for training
        if self.begin_index is None:
            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
-        elif self.step_index is not None:
-            # add_noise is called after first denoising step (for inpainting)
-            step_indices = [self.step_index] * timesteps.shape[0]
        else:
-            # add noise is called before first denoising step to create initial latent(img2img)
            step_indices = [self.begin_index] * timesteps.shape[0]

        sigma = sigmas[step_indices].flatten()
@@ -61,7 +61,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -282,7 +282,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
    @property
    def step_index(self):
        """
-        The index counter for current timestep. It will increase 1 after each scheduler step.
+        The index counter for current timestep. It will increase 1 after each scheduler step.
        """
        return self._step_index

@@ -1011,14 +1011,10 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
            schedule_timesteps = self.timesteps.to(original_samples.device)
            timesteps = timesteps.to(original_samples.device)

-        # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
+        # begin_index is None when the scheduler is used for training
        if self.begin_index is None:
            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
-        elif self.step_index is not None:
-            # add_noise is called after first denoising step (for inpainting)
-            step_indices = [self.step_index] * timesteps.shape[0]
        else:
-            # add noise is called before first denoising step to create initial latent(img2img)
            step_indices = [self.begin_index] * timesteps.shape[0]

        sigma = sigmas[step_indices].flatten()
@@ -61,7 +61,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -217,7 +217,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
    @property
    def step_index(self):
        """
-        The index counter for current timestep. It will increase 1 after each scheduler step.
+        The index counter for current timestep. It will increase 1 after each scheduler step.
        """
        return self._step_index

@@ -233,7 +233,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
        """
        # Clipping the minimum of all lambda(t) for numerical stability.
        # This is critical for cosine (squaredcos_cap_v2) noise schedule.
-        clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.config.lambda_min_clipped).item()
+        clipped_idx = torch.searchsorted(torch.flip(self.lambda_t, [0]), self.lambda_min_clipped).item()
        self.noisiest_timestep = self.config.num_train_timesteps - 1 - clipped_idx

        # "linspace", "leading", "trailing" corresponds to annotation of Table 2. of https://arxiv.org/abs/2305.08891
@@ -110,7 +110,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -233,7 +233,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
    @property
    def step_index(self):
        """
-        The index counter for current timestep. It will increase 1 after each scheduler step.
+        The index counter for current timestep. It will increase 1 after each scheduler step.
        """
        return self._step_index

@@ -325,7 +325,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
        log_sigmas = np.log(sigmas)
        sigmas = np.interp(timesteps, np.arange(0, len(sigmas)), sigmas)

-        if self.config.use_karras_sigmas:
+        if self.use_karras_sigmas:
            sigmas = self._convert_to_karras(in_sigmas=sigmas)
            timesteps = np.array([self._sigma_to_t(sigma, log_sigmas) for sigma in sigmas])

@@ -543,11 +543,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
        if self.begin_index is None:
            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
-        elif self.step_index is not None:
-            # add_noise is called after first denoising step (for inpainting)
-            step_indices = [self.step_index] * timesteps.shape[0]
        else:
-            # add noise is called before first denoising step to create initial latent(img2img)
            step_indices = [self.begin_index] * timesteps.shape[0]

        sigma = sigmas[step_indices].flatten()
@@ -63,7 +63,7 @@ def betas_for_alpha_bar(
            return math.exp(t * -12.0)

    else:
-        raise ValueError(f"Unsupported alpha_transform_type: {alpha_transform_type}")
+        raise ValueError(f"Unsupported alpha_tranform_type: {alpha_transform_type}")

    betas = []
    for i in range(num_diffusion_timesteps):
@@ -252,7 +252,7 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
    @property
    def step_index(self):
        """
-        The index counter for current timestep. It will increase 1 after each scheduler step.
+        The index counter for current timestep. It will increase 1 after each scheduler step.
        """
        return self._step_index

@@ -961,14 +961,10 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
            schedule_timesteps = self.timesteps.to(original_samples.device)
            timesteps = timesteps.to(original_samples.device)

-        # begin_index is None when the scheduler is used for training or pipeline does not implement set_begin_index
+        # begin_index is None when the scheduler is used for training
        if self.begin_index is None:
            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
-        elif self.step_index is not None:
-            # add_noise is called after first denoising step (for inpainting)
-            step_indices = [self.step_index] * timesteps.shape[0]
        else:
-            # add noise is called before first denoising step to create initial latent(img2img)
            step_indices = [self.begin_index] * timesteps.shape[0]

        sigma = sigmas[step_indices].flatten()
@@ -143,7 +143,7 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
    @property
    def step_index(self):
        """
-        The index counter for current timestep. It will increase 1 after each scheduler step.
+        The index counter for current timestep. It will increase 1 after each scheduler step.
        """
        return self._step_index

@@ -669,11 +669,7 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
        if self.begin_index is None:
            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
-        elif self.step_index is not None:
-            # add_noise is called after first denoising step (for inpainting)
-            step_indices = [self.step_index] * timesteps.shape[0]
        else:
-            # add noise is called before first denoising step to create initial latent(img2img)
            step_indices = [self.begin_index] * timesteps.shape[0]

        sigma = sigmas[step_indices].flatten()
@@ -111,7 +111,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
    @property
    def step_index(self):
        """
-        The index counter for current timestep. It will increase 1 after each scheduler step.
+        The index counter for current timestep. It will increase 1 after each scheduler step.
        """
        return self._step_index

@@ -367,11 +367,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
        # self.begin_index is None when scheduler is used for training, or pipeline does not implement set_begin_index
        if self.begin_index is None:
            step_indices = [self.index_for_timestep(t, schedule_timesteps) for t in timesteps]
-        elif self.step_index is not None:
-            # add_noise is called after first denoising step (for inpainting)
-            step_indices = [self.step_index] * timesteps.shape[0]
        else:
-            # add noise is called before first denoising step to create initial latent(img2img)
            step_indices = [self.begin_index] * timesteps.shape[0]

        sigma = sigmas[step_indices].flatten()
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
patil-suraj	ea238e821b	up	2024-03-18 11:47:47 +01:00
patil-suraj	b6d1d670fc	up	2024-03-18 11:34:17 +01:00