debug

finish
fix SDXL flax init
2023-09-26 16:53:20 +02:00 · 2023-09-26 15:57:10 +02:00 · 2023-09-26 15:54:11 +02:00 · 2023-09-26 15:06:45 +02:00 · 2023-09-26 18:14:52 +05:30 · 2023-09-26 13:45:37 +05:30
192 changed files with 3493 additions and 1114 deletions
@@ -15,6 +15,7 @@ body:
             *The community cannot solve your issue if it cannot reproduce it. If your bug is related to training, add your training script and make everything needed to train public. Otherwise, just add a simple Python code snippet.*
        - 3. Add the **minimum amount of code / context that is needed to understand, reproduce your issue**.
             *Make the life of maintainers easy. `diffusers` is getting many issues every day. Make sure your issue is about one bug and one bug only. Make sure you add only the context, code needed to understand your issues - nothing more. Generally, every issue is a way of documenting this library, try to make it a good documentation entry.*
+        - 4. For issues related to community pipelines (i.e., the pipelines located in the `examples/community` folder), please tag the author of the pipeline in your issue thread as those pipelines are not maintained.
  - type: markdown
    attributes:
      value: |
@@ -70,7 +71,7 @@ body:

        Questions on schedulers: @patrickvonplaten and @williamberman

-        Questions on models and pipelines: @patrickvonplaten, @sayakpaul, and @williamberman
+        Questions on models and pipelines: @patrickvonplaten, @sayakpaul, and @williamberman (for community pipelines, please tag the original author of the pipeline)

        Questions on JAX- and MPS-related things: @pcuenca

@@ -26,6 +26,7 @@ jobs:
        image-name:
          - diffusers-pytorch-cpu
          - diffusers-pytorch-cuda
+          - diffusers-pytorch-compile-cuda
          - diffusers-flax-cpu
          - diffusers-flax-tpu
          - diffusers-onnxruntime-cpu
@@ -0,0 +1,67 @@
+name: Fast tests for PRs - PEFT backend
+
+on:
+  pull_request:
+    branches:
+      - main
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  DIFFUSERS_IS_CI: yes
+  OMP_NUM_THREADS: 4
+  MKL_NUM_THREADS: 4
+  PYTEST_TIMEOUT: 60
+
+jobs:
+  run_fast_tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - name: LoRA
+            framework: lora
+            runner: docker-cpu
+            image: diffusers/diffusers-pytorch-cpu
+            report: torch_cpu_lora
+
+
+    name: ${{ matrix.config.name }}
+
+    runs-on: ${{ matrix.config.runner }}
+
+    container:
+      image: ${{ matrix.config.image }}
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        apt-get update && apt-get install libsndfile1-dev libgl1 -y
+        python -m pip install -e .[quality,test]
+        python -m pip install git+https://github.com/huggingface/accelerate.git
+        python -m pip install -U git+https://github.com/huggingface/transformers.git
+        python -m pip install -U git+https://github.com/huggingface/peft.git
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run fast PyTorch LoRA CPU tests with PEFT backend
+      if: ${{ matrix.config.framework == 'lora' }}
+      run: |
+        python -m pytest -n 2 --max-worker-restart=0 --dist=loadfile \
+          -s -v \
+          --make-reports=tests_${{ matrix.config.report }} \
+          tests/lora/test_lora_layers_peft.py
@@ -74,11 +74,11 @@ jobs:
      env:
        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-        CUBLAS_WORKSPACE_CONFIG: :16:8 
+        CUBLAS_WORKSPACE_CONFIG: :16:8

      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-          -s -v -k "not Flax and not Onnx" \
+          -s -v -k "not Flax and not Onnx and not compile" \
          --make-reports=tests_${{ matrix.config.report }} \
          tests/

@@ -113,6 +113,50 @@ jobs:
        name: ${{ matrix.config.report }}_test_reports
        path: reports

+  run_torch_compile_tests:
+    name: PyTorch Compile CUDA tests
+
+    runs-on: docker-gpu
+
+    container:
+      image: diffusers/diffusers-pytorch-compile-cuda
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: NVIDIA-SMI
+      run: |
+        nvidia-smi
+
+    - name: Install dependencies
+      run: |
+        python -m pip install -e .[quality,test,training]
+
+    - name: Environment
+      run: |
+        python utils/print_env.py
+
+    - name: Run example tests on GPU
+      env:
+        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+      run: |
+        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: cat reports/tests_torch_compile_cuda_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v2
+      with:
+        name: torch_compile_test_reports
+        path: reports
+
  run_examples_tests:
    name: Examples PyTorch CUDA tests on Ubuntu

@@ -0,0 +1,47 @@
+FROM nvidia/cuda:11.7.1-cudnn8-runtime-ubuntu20.04
+LABEL maintainer="Hugging Face"
+LABEL repository="diffusers"
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt update && \
+    apt install -y bash \
+                   build-essential \
+                   git \
+                   git-lfs \
+                   curl \
+                   ca-certificates \
+                   libsndfile1-dev \
+                   libgl1 \
+                   python3.9 \
+                   python3-pip \
+                   python3.9-venv && \
+    rm -rf /var/lib/apt/lists
+
+# make sure to use venv
+RUN python3 -m venv /opt/venv
+ENV PATH="/opt/venv/bin:$PATH"
+
+# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
+RUN python3 -m pip install --no-cache-dir --upgrade pip && \
+    python3 -m pip install --no-cache-dir \
+        torch \
+        torchvision \
+        torchaudio \
+        invisible_watermark && \
+    python3 -m pip install --no-cache-dir \
+        accelerate \
+        datasets \
+        hf-doc-builder \
+        huggingface-hub \
+        Jinja2 \
+        librosa \
+        numpy \
+        scipy \
+        tensorboard \
+        transformers \
+        omegaconf \
+        pytorch-lightning \
+        xformers
+
+CMD ["/bin/bash"]
@@ -67,30 +67,30 @@ By default, `tqdm` progress bars are displayed during model download. [`logging.

 ## Base setters

-[[autodoc]] logging.set_verbosity_error
+[[autodoc]] utils.logging.set_verbosity_error

-[[autodoc]] logging.set_verbosity_warning
+[[autodoc]] utils.logging.set_verbosity_warning

-[[autodoc]] logging.set_verbosity_info
+[[autodoc]] utils.logging.set_verbosity_info

-[[autodoc]] logging.set_verbosity_debug
+[[autodoc]] utils.logging.set_verbosity_debug

 ## Other functions

-[[autodoc]] logging.get_verbosity
+[[autodoc]] utils.logging.get_verbosity

-[[autodoc]] logging.set_verbosity
+[[autodoc]] utils.logging.set_verbosity

-[[autodoc]] logging.get_logger
+[[autodoc]] utils.logging.get_logger

-[[autodoc]] logging.enable_default_handler
+[[autodoc]] utils.logging.enable_default_handler

-[[autodoc]] logging.disable_default_handler
+[[autodoc]] utils.logging.disable_default_handler

-[[autodoc]] logging.enable_explicit_format
+[[autodoc]] utils.logging.enable_explicit_format

-[[autodoc]] logging.reset_format
+[[autodoc]] utils.logging.reset_format

-[[autodoc]] logging.enable_progress_bar
+[[autodoc]] utils.logging.enable_progress_bar

-[[autodoc]] logging.disable_progress_bar
+[[autodoc]] utils.logging.disable_progress_bar
@@ -34,13 +34,7 @@ Make sure to check out the Schedulers [guide](/using-diffusers/schedulers) to le
 	- load_lora_weights
 	- save_lora_weights

-## StableDiffusionPipelineOutput
-[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
-
 ## StableDiffusionXLInstructPix2PixPipeline
 [[autodoc]] StableDiffusionXLInstructPix2PixPipeline
 	- __call__
 	- all
-
-## StableDiffusionXLPipelineOutput
-[[autodoc]] pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput
@@ -31,5 +31,5 @@ Make sure to check out the Schedulers [guide](/using-diffusers/schedulers) to le
 	- __call__

 ## StableDiffusionSafePipelineOutput
-[[autodoc]] pipelines.semantic_stable_diffusion.SemanticStableDiffusionPipelineOutput
-	- all
+[[autodoc]] pipelines.semantic_stable_diffusion.pipeline_output.SemanticStableDiffusionPipelineOutput
+	- all
@@ -2,7 +2,7 @@

 <img src="https://github.com/dome272/Wuerstchen/assets/61938694/0617c863-165a-43ee-9303-2a17299a0cf9">

-[Würstchen: Efficient Pretraining of Text-to-Image Models](https://huggingface.co/papers/2306.00637) is by Pablo Pernias, Dominic Rampas, and Marc Aubreville.
+[Würstchen: Efficient Pretraining of Text-to-Image Models](https://huggingface.co/papers/2306.00637) is by Pablo Pernias, Dominic Rampas, Mats L. Richter and Christopher Pal and Marc Aubreville.

 The abstract from the paper is:

@@ -134,3 +134,16 @@ The original codebase, as well as experimental ideas, can be found at [dome272/W
 [[autodoc]] WuerstchenDecoderPipeline
 	- all
 	- __call__
+
+## Citation
+
+```bibtex
+      @misc{pernias2023wuerstchen,
+            title={Wuerstchen: Efficient Pretraining of Text-to-Image Models}, 
+            author={Pablo Pernias and Dominic Rampas and Mats L. Richter and Christopher Pal and Marc Aubreville},
+            year={2023},
+            eprint={2306.00637},
+            archivePrefix={arXiv},
+            primaryClass={cs.CV}
+      }
+```
@@ -10,91 +10,597 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Text-guided image-to-image generation
+# Image-to-image

 [[open-in-colab]]

-The [`StableDiffusionImg2ImgPipeline`] lets you pass a text prompt and an initial image to condition the generation of new images.
+Image-to-image is similar to [text-to-image](conditional_image_generation), but in addition to a prompt, you can also pass an initial image as a starting point for the diffusion process. The initial image is encoded to latent space and noise is added to it. Then the latent diffusion model takes a prompt and the noisy latent image, predicts the added noise, and removes the predicted noise from the initial latent image to get the new latent image. Lastly, a decoder decodes the new latent image back into an image.

-Before you begin, make sure you have all the necessary libraries installed:
+With 🤗 Diffusers, this is as easy as 1-2-3:
+
+1. Load a checkpoint into the [`AutoPipelineForImage2Image`] class; this pipeline automatically handles loading the correct pipeline class  based on the checkpoint:

 ```py
-# uncomment to install the necessary libraries in Colab
-#!pip install diffusers transformers ftfy accelerate
+from diffusers import AutoPipelineForImage2Image
+from diffusers.utils import load_image
+
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
 ```

-Get started by creating a [`StableDiffusionImg2ImgPipeline`] with a pretrained Stable Diffusion model like [`nitrosocke/Ghibli-Diffusion`](https://huggingface.co/nitrosocke/Ghibli-Diffusion).
+<Tip>

-```python
+You'll notice throughout the guide, we use [`~DiffusionPipeline.enable_model_cpu_offload`] and [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`], to save memory and increase inference speed. If you're using PyTorch 2.0, then you don't need to call [`~DiffusionPipeline.enable_xformers_memory_efficient_attention`] on your pipeline because it'll already be using PyTorch 2.0's native [scaled-dot product attention](/optimization/torch2.0#scaled-dot-product-attention).
+
+</Tip>
+
+2. Load an image to pass to the pipeline:
+
+```py
+init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png")
+```
+
+3. Pass a prompt and image to the pipeline to generate an image:
+
+```py
+prompt = "cat wizard, gandalf, lord of the rings, detailed, fantasy, cute, adorable, Pixar, Disney, 8k"
+image = pipeline(prompt, image=init_image).images[0]
+image
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
+  </div>
+</div>
+
+## Popular models
+
+The most popular image-to-image models are [Stable Diffusion v1.5](https://huggingface.co/runwayml/stable-diffusion-v1-5), [Stable Diffusion XL (SDXL)](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0), and [Kandinsky 2.2](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder). The results from the Stable Diffusion and Kandinsky models vary due to their architecture differences and training process; you can generally expect SDXL to produce higher quality images than Stable Diffusion v1.5. Let's take a quick look at how to use each of these models and compare their results.
+
+### Stable Diffusion v1.5
+
+Stable Diffusion v1.5 is a latent diffusion model intialized from an earlier checkpoint, and further finetuned for 595K steps on 512x512 images. To use this pipeline for image-to-image, you'll need to prepare an initial image to pass to the pipeline. Then you can pass a prompt and the image to the pipeline to generate a new image:
+
+```py
 import torch
 import requests
 from PIL import Image
 from io import BytesIO
-from diffusers import StableDiffusionImg2ImgPipeline
+from diffusers import AutoPipelineForImage2Image

-device = "cuda"
-pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
-    "nitrosocke/Ghibli-Diffusion", torch_dtype=torch.float16, use_safetensors=True
-).to(device)
-```
-
-Download and preprocess an initial image so you can pass it to the pipeline:
-
-```python
-url = "https://raw.githubusercontent.com/CompVis/stable-diffusion/main/assets/stable-samples/img2img/sketch-mountains-input.jpg"
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()

+# prepare image
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
 response = requests.get(url)
 init_image = Image.open(BytesIO(response.content)).convert("RGB")
-init_image.thumbnail((768, 768))
-init_image
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+
+# pass prompt and image to pipeline
+image = pipeline(prompt, image=init_image).images[0]
+image
 ```

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/YiYiXu/test-doc-assets/resolve/main/image_2_image_using_diffusers_cell_8_output_0.jpeg"/>
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-sdv1.5.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
+  </div>
 </div>

+### Stable Diffusion XL (SDXL)
+
+SDXL is a more powerful version of the Stable Diffusion model. It uses a larger base model, and an additional refiner model to increase the quality of the base model's output. Read the [SDXL](sdxl) guide for a more detailed walkthrough of how to use this model, and other techniques it uses to produce high quality images.
+
+```py
+import torch
+import requests
+from PIL import Image
+from io import BytesIO
+from diffusers import AutoPipelineForImage2Image
+
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+# prepare image
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-sdxl-init.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+
+# pass prompt and image to pipeline
+image = pipeline(prompt, image=init_image, strength=).images[0]
+image
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-sdxl-init.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-sdxl.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
+  </div>
+</div>
+
+### Kandinsky 2.2
+
+The Kandinsky model is different from the Stable Diffusion models because it uses an image prior model to create image embeddings. The embeddings help create a better alignment between text and images, allowing the latent diffusion model to generate better images.
+
+The simplest way to use Kandinsky 2.2 is:
+
+```py
+import torch
+import requests
+from PIL import Image
+from io import BytesIO
+from diffusers import AutoPipelineForImage2Image
+
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+# prepare image
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+
+# pass prompt and image to pipeline
+image = pipeline(prompt, image=init_image).images[0]
+image
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-kandinsky.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
+  </div>
+</div>
+
+## Configure pipeline parameters
+
+There are several important parameters you can configure in the pipeline that'll affect the image generation process and image quality. Let's take a closer look at what these parameters do and how changing them affects the output.
+
+### Strength
+
+`strength` is one of the most important parameters to consider and it'll have a huge impact on your generated image. It determines how much the generated image resembles the initial image. In other words:
+
+- 📈 a higher `strength` value gives the model more "creativity" to generate an image that's different from the initial image; a `strength` value of 1.0 means the initial image is more or less ignored
+- 📉 a lower `strength` value means the generated image is more similar to the initial image
+
+The `strength` and `num_inference_steps` parameter are related because `strength` determines the number of noise steps to add. For example, if the `num_inference_steps` is 50 and `strength` is 0.8, then this means adding 40 (50 * 0.8) steps of noise to the initial image and then denoising for 40 steps to get the newly generated image.
+
+```py
+import torch
+import requests
+from PIL import Image
+from io import BytesIO
+from diffusers import AutoPipelineForImage2Image
+
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+# prepare image
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+image = init_image
+
+# pass prompt and image to pipeline
+image = pipeline(prompt, image=init_image, strength=0.8).images[0]
+image
+```
+
+<div class="flex flex-row gap-4">
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-strength-0.4.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">strength = 0.4</figcaption>
+  </div>
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-strength-0.6.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">strength = 0.6</figcaption>
+  </div>
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-strength-1.0.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">strength = 1.0</figcaption>
+  </div>
+</div>
+
+### Guidance scale
+
+The `guidance_scale` parameter is used to control how closely aligned the generated image and text prompt are. A higher `guidance_scale` value means your generated image is more aligned with the prompt, while a lower `guidance_scale` value means your generated image has more space to deviate from the prompt.
+
+You can combine `guidance_scale` with `strength` for even more precise control over how expressive the model is. For example, combine a high `strength + guidance_scale` for maximum creativity or use a combination of low `strength` and low `guidance_scale` to generate an image that resembles the initial image but is not as strictly bound to the prompt.
+
+```py
+import torch
+import requests
+from PIL import Image
+from io import BytesIO
+from diffusers import AutoPipelineForImage2Image
+
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+# prepare image
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+
+# pass prompt and image to pipeline
+image = pipeline(prompt, image=init_image, guidance_scale=8.0).images[0]
+image
+```
+
+<div class="flex flex-row gap-4">
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-guidance-0.1.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">guidance_scale = 0.1</figcaption>
+  </div>
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-guidance-3.0.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">guidance_scale = 5.0</figcaption>
+  </div>
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-guidance-7.5.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">guidance_scale = 10.0</figcaption>
+  </div>
+</div>
+
+### Negative prompt
+
+A negative prompt conditions the model to *not* include things in an image, and it can be used to improve image quality or modify an image. For example, you can improve image quality by including negative prompts like "poor details" or "blurry" to encourage the model to generate a higher quality image. Or you can modify an image by specifying things to exclude from an image.
+
+```py
+import torch
+import requests
+from PIL import Image
+from io import BytesIO
+from diffusers import AutoPipelineForImage2Image
+
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "stabilityai/stable-diffusion-xl-refiner-1.0", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+# prepare image
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+negative_prompt = "ugly, deformed, disfigured, poor details, bad anatomy"
+
+# pass prompt and image to pipeline
+image = pipeline(prompt, negative_prompt=negative_prompt, image=init_image).images[0]
+image
+```
+
+<div class="flex flex-row gap-4">
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-negative-1.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">negative prompt = "ugly, deformed, disfigured, poor details, bad anatomy"</figcaption>
+  </div>
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-negative-2.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">negative prompt = "jungle"</figcaption>
+  </div>
+</div>
+
+## Chained image-to-image pipelines
+
+There are some other interesting ways you can use an image-to-image pipeline aside from just generating an image (although that is pretty cool too). You can take it a step further and chain it with other pipelines.
+
+### Text-to-image-to-image
+
+Chaining a text-to-image and image-to-image pipeline allows you to generate an image from text and use the generated image as the initial image for the image-to-image pipeline. This is useful if you want to generate an image entirely from scratch. For example, let's chain a Stable Diffusion and a Kandinsky model.
+
+Start by generating an image with the text-to-image pipeline:
+
+```py
+from diffusers import AutoPipelineForText2Image, AutoPipelineForImage2Image
+import torch
+
+pipeline = AutoPipelineForText2Image.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+image = pipeline("Astronaut in a jungle, cold color palette, muted colors, detailed, 8k").images[0]
+```
+
+Now you can pass this generated image to the image-to-image pipeline:
+
+```py
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "kandinsky-community/kandinsky-2-2-decoder", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+image = pipeline("Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", image=image).images[0]
+image
+```
+
+### Image-to-image-to-image
+
+You can also chain multiple image-to-image pipelines together to create more interesting images. This can be useful for iteratively performing style transfer on an image, generate short GIFs, restore color to an image, or restore missing areas of an image.
+
+Start by generating an image:
+
+```py
+import torch
+import requests
+from PIL import Image
+from io import BytesIO
+from diffusers import AutoPipelineForImage2Image
+
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+# prepare image
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+
+# pass prompt and image to pipeline
+image = pipeline(prompt, image=init_image, output_type="latent").images[0]
+```
+
 <Tip>

-💡 `strength` is a value between 0.0 and 1.0 that controls the amount of noise added to the input image. Values that approach 1.0 allow for lots of variations but will also produce images that are not semantically consistent with the input.
+It is important to specify `output_type="latent"` in the pipeline to keep all the outputs in latent space to avoid an unnecessary decode-encode step. This only works if the chained pipelines are using the same VAE.

 </Tip>

-Define the prompt (for this checkpoint finetuned on Ghibli-style art, you need to prefix the prompt with the `ghibli style` tokens) and run the pipeline:
+Pass the latent output from this pipeline to the next pipeline to generate an image in a [comic book art style](https://huggingface.co/ogkalu/Comic-Diffusion):

-```python
-prompt = "ghibli style, a fantasy landscape with castles"
-generator = torch.Generator(device=device).manual_seed(1024)
-image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5, generator=generator).images[0]
+```py
+pipelne = AutoPipelineForImage2Image.from_pretrained(
+    "ogkalu/Comic-Diffusion", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+# need to include the token "charliebo artstyle" in the prompt to use this checkpoint
+image = pipeline("Astronaut in a jungle, charliebo artstyle", image=image, output_type="latent").images[0]
+```
+
+Repeat one more time to generate the final image in a [pixel art style](https://huggingface.co/kohbanye/pixel-art-style):
+
+```py
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "kohbanye/pixel-art-style", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+# need to include the token "pixelartstyle" in the prompt to use this checkpoint
+image = pipeline("Astronaut in a jungle, pixelartstyle", image=image).images[0]
+image
+```
+
+### Image-to-upscaler-to-super-resolution
+
+Another way you can chain your image-to-image pipeline is with an upscaler and super-resolution pipeline to really increase the level of details in an image.
+
+Start with an image-to-image pipeline:
+
+```py
+import torch
+import requests
+from PIL import Image
+from io import BytesIO
+from diffusers import AutoPipelineForImage2Image
+
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+# prepare image
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+
+# pass prompt and image to pipeline
+image_1 = pipeline(prompt, image=init_image, output_type="latent").images[0]
+```
+
+<Tip>
+
+It is important to specify `output_type="latent"` in the pipeline to keep all the outputs in *latent* space to avoid an unnecessary decode-encode step. This only works if the chained pipelines are using the same VAE.
+
+</Tip>
+
+Chain it to an upscaler pipeline to increase the image resolution:
+
+```py
+upscaler = AutoPipelineForImage2Image.from_pretrained(
+    "stabilityai/sd-x2-latent-upscaler", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+upscaler.enable_model_cpu_offload()
+upscaler.enable_xformers_memory_efficient_attention()
+
+image_2 = upscaler(prompt, image=image_1, output_type="latent").images[0]
+```
+
+Finally, chain it to a super-resolution pipeline to further enhance the resolution:
+
+```py
+super_res = AutoPipelineForImage2Image.from_pretrained(
+    "stabilityai/stable-diffusion-x4-upscaler", torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+super_res.enable_model_cpu_offload()
+super_res.enable_xformers_memory_efficient_attention()
+
+image_3 = upscaler(prompt, image=image_2).images[0]
+image_3
+```
+
+## Control image generation
+
+Trying to generate an image that looks exactly the way you want can be difficult, which is why controlled generation techniques and models are so useful. While you can use the `negative_prompt` to partially control image generation, there are more robust methods like prompt weighting and ControlNets.
+
+### Prompt weighting
+
+Prompt weighting allows you to scale the representation of each concept in a prompt. For example, in a prompt like "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", you can choose to increase or decrease the embeddings of "astronaut" and "jungle". The [Compel](https://github.com/damian0815/compel) library provides a simple syntax for adjusting prompt weights and generating the embeddings. You can learn how to create the embeddings in the [Prompt weighting](weighted_prompts) guide.
+
+[`AutoPipelineForImage2Image`] has a `prompt_embeds` (and `negative_prompt_embeds` if you're using a negative prompt) parameter where you can pass the embeddings which replaces the `prompt` parameter.
+
+```py
+from diffusers import AutoPipelineForImage2Image
+import torch
+
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+image = pipeline(prompt_emebds=prompt_embeds, # generated from Compel
+    negative_prompt_embeds, # generated from Compel
+    image=init_image,
+).images[0]
+```
+
+### ControlNet
+
+ControlNets provide a more flexible and accurate way to control image generation because you can use an additional conditioning image. The conditioning image can be a canny image, depth map, image segmentation, and even scribbles! Whatever type of conditioning image you choose, the ControlNet generates an image that preserves the information in it.
+
+For example, let's condition an image with a depth map to keep the spatial information in the image.
+
+```py
+# prepare image
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"
+response = requests.get(url)
+init_image = Image.open(BytesIO(response.content)).convert("RGB")
+init_image = init_image.resize((958, 960)) # resize to depth image dimensions
+depth_image = load_image("https://huggingface.co/lllyasviel/control_v11f1p_sd15_depth/resolve/main/images/control.png")
+```
+
+Load a ControlNet model conditioned on depth maps and the [`AutoPipelineForImage2Image`]:
+
+```py
+from diffusers import ControlNetModel, AutoPipelineForImage2Image
+from diffusers.utils import load_image
+import torch
+
+controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11f1p_sd15_depth", torch_dtype=torch.float16, variant="fp16", use_safetensors=True)
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16, variant="fp16", use_safetensors=True
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+```
+
+Now generate a new image conditioned on the depth map, initial image, and prompt:
+
+```py
+prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
+image = pipeline(prompt, image=init_image, control_image=depth_image).images[0]
+image
+```
+
+<div class="flex flex-row gap-4">
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">initial image</figcaption>
+  </div>
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/lllyasviel/control_v11f1p_sd15_depth/resolve/main/images/control.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">depth image</figcaption>
+  </div>
+  <div class="flex-1">
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-controlnet.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">ControlNet image</figcaption>
+  </div>
+</div>
+
+Let's apply a new [style](https://huggingface.co/nitrosocke/elden-ring-diffusion) to the image generated from the ControlNet by chaining it with an image-to-image pipeline:
+
+```py
+pipeline = AutoPipelineForImage2Image.from_pretrained(
+    "nitrosocke/elden-ring-diffusion", torch_dtype=torch.float16,
+).to("cuda")
+pipeline.enable_model_cpu_offload()
+pipeline.enable_xformers_memory_efficient_attention()
+
+prompt = "elden ring style astronaut in a jungle" # include the token "elden ring style" in the prompt
+negative_prompt = "ugly, deformed, disfigured, poor details, bad anatomy"
+
+image = pipeline(prompt, negative_prompt=negative_prompt, image=init_image, strength=0.45, guidance_scale=10.5).images[0]
 image
 ```

 <div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ghibli-castles.png"/>
+  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-elden-ring.png">
 </div>

-You can also try experimenting with a different scheduler to see how that affects the output:
+## Optimize

-```python
-from diffusers import LMSDiscreteScheduler
+Running diffusion models is computationally expensive and intensive, but with a few optimization tricks, it is entirely possible to run them on consumer and free-tier GPUs. For example, you can use a more memory-efficient form of attention such as PyTorch 2.0's [scaled-dot product attention](optimization/torch2.0#scaled-dot-product-attention) or [xFormers](optimization/xformers) (you can use one or the other, but there's no need to use both). You can also offload the model to the GPU while the other pipeline components wait on the CPU.

-lms = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
-pipe.scheduler = lms
-generator = torch.Generator(device=device).manual_seed(1024)
-image = pipe(prompt=prompt, image=init_image, strength=0.75, guidance_scale=7.5, generator=generator).images[0]
-image
+```diff
+ pipeline.enable_model_cpu_offload()
+ pipeline.enable_xformers_memory_efficient_attention()
 ```

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lms-ghibli.png"/>
-</div>
+With [`torch.compile`](optimization/torch2.0#torch.compile), you can boost your inference speed even more by wrapping your UNet with it:

-Check out the Spaces below, and try generating images with different values for `strength`. You'll notice that using lower values for `strength` produces images that are more similar to the original image.
+```py
+pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+```

-Feel free to also switch the scheduler to the [`LMSDiscreteScheduler`] and see how that affects the output.
-
-<iframe
-	src="https://stevhliu-ghibli-img2img.hf.space"
-	frameborder="0"
-	width="850"
-	height="500"
-></iframe>
+To learn more, take a look at the [Reduce memory usage](optimization/memory) and [Torch 2.0](optimization/torch2.0) guides.
@@ -1,19 +0,0 @@
-# What is safetensors ? 
-
-[safetensors](https://github.com/huggingface/safetensors) is a different format
-from the classic `.bin` which uses Pytorch which uses pickle.
-
-Pickle is notoriously unsafe which allow any malicious file to execute arbitrary code.
-The hub itself tries to prevent issues from it, but it's not a silver bullet.
-
-`safetensors` first and foremost goal is to make loading machine learning models *safe*
-in the sense that no takeover of your computer can be done.
-
-# Why use safetensors ?
-
-**Safety** can be one reason, if you're attempting to use a not well known model and
-you're not sure about the source of the file.
-
-And a secondary reason, is **the speed of loading**. Safetensors can load models much faster
-than regular pickle files. If you spend a lot of times switching models, this can be
-a huge timesave.
@@ -3,7 +3,7 @@ import inspect
 from typing import Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from torch.nn import functional as F
 from torchvision import transforms
@@ -2,7 +2,7 @@ import inspect
 from typing import List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from torch import nn
 from torch.nn import functional as F
@@ -14,7 +14,7 @@

 from typing import List, Optional, Tuple, Union

-import PIL
+import PIL.Image
 import torch
 from torchvision import transforms

@@ -7,7 +7,7 @@ import warnings
 from typing import List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 import torch.nn.functional as F
 from accelerate import Accelerator
@@ -2,7 +2,7 @@ import inspect
 from typing import Callable, List, Optional, Tuple, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

@@ -3,7 +3,7 @@ import re
 from typing import Any, Callable, Dict, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from packaging import version
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
@@ -3,7 +3,7 @@ import re
 from typing import Callable, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from packaging import version
 from transformers import CLIPImageProcessor, CLIPTokenizer
@@ -1029,7 +1029,7 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo
                Guidance rescale factor should fix overexposure when using zero terminal SNR.
            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
-                `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
                explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
@@ -1039,7 +1039,7 @@ class SDXLLongPromptWeightingPipeline(DiffusionPipeline, FromSingleFileMixin, Lo
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                For most cases, `target_size` should be set to the desired height and width of the generated image. If
-                not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).

        Examples:
@@ -1,7 +1,7 @@
 from typing import Any, Callable, Dict, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch

 from diffusers import StableDiffusionImg2ImgPipeline
@@ -6,7 +6,7 @@ from typing import Any, Callable, Dict, List, Optional, Union

 import kornia
 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from packaging import version
 from transformers import CLIPFeatureExtractor, CLIPVisionModelWithProjection
@@ -16,7 +16,7 @@ import inspect
 from typing import Callable, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from packaging import version
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer
@@ -24,7 +24,7 @@ from typing import List, Optional, Union
 import numpy as np
 import onnx
 import onnx_graphsurgeon as gs
-import PIL
+import PIL.Image
 import tensorrt as trt
 import torch
 from huggingface_hub import snapshot_download
@@ -24,7 +24,7 @@ from typing import List, Optional, Union
 import numpy as np
 import onnx
 import onnx_graphsurgeon as gs
-import PIL
+import PIL.Image
 import tensorrt as trt
 import torch
 from huggingface_hub import snapshot_download
@@ -1,6 +1,6 @@
 from typing import Callable, List, Optional, Union

-import PIL
+import PIL.Image
 import torch
 from transformers import (
    CLIPImageProcessor,
@@ -16,7 +16,7 @@ import math
 from typing import Callable, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from PIL import Image
 from transformers import CLIPTextModel, CLIPTokenizer
@@ -1,7 +1,7 @@
 import inspect
 from typing import List, Optional, Union

-import PIL
+import PIL.Image
 import torch
 from torch.nn import functional as F
 from transformers import (
@@ -907,10 +907,17 @@ def main():

            if args.snr_gamma is not None:
                snr = jnp.array(compute_snr(timesteps))
-                snr_loss_weights = jnp.where(snr < args.snr_gamma, snr, jnp.ones_like(snr) * args.snr_gamma) / snr
+                base_weights = jnp.where(snr < args.snr_gamma, snr, jnp.ones_like(snr) * args.snr_gamma) / snr
                if noise_scheduler.config.prediction_type == "v_prediction":
-                    # velocity objective prediction requires SNR weights to be floored to a min value of 1.
-                    snr_loss_weights = snr_loss_weights + 1
+                    snr_loss_weights = base_weights + 1
+                else:
+                    # Epsilon and sample prediction use the base weights.
+                    snr_loss_weights = base_weights
+                # For zero-terminal SNR, we have to handle the case where a sigma of Zero results in a Inf value.
+                # When we run this, the MSE loss weights for this timestep is set unconditionally to 1.
+                # If we do not run this, the loss value will go to NaN almost immediately, usually within one step.
+                snr_loss_weights[snr == 0] = 1.0
+
                loss = loss * snr_loss_weights

            loss = loss.mean()
@@ -801,9 +801,22 @@ def main():
                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
                    # This is discussed in Section 4.2 of the same paper.
                    snr = compute_snr(timesteps)
-                    mse_loss_weights = (
+                    base_weight = (
                        torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
                    )
+
+                    if noise_scheduler.config.prediction_type == "v_prediction":
+                        # Velocity objective needs to be floored to an SNR weight of one.
+                        mse_loss_weights = base_weight + 1
+                    else:
+                        # Epsilon and sample both use the same loss weights.
+                        mse_loss_weights = base_weight
+
+                    # For zero-terminal SNR, we have to handle the case where a sigma of Zero results in a Inf value.
+                    # When we run this, the MSE loss weights for this timestep is set unconditionally to 1.
+                    # If we do not run this, the loss value will go to NaN almost immediately, usually within one step.
+                    mse_loss_weights[snr == 0] = 1.0
+
                    # We first calculate the original loss. Then we mean over the non-batch dimensions and
                    # rebalance the sample-wise losses with their respective loss weights.
                    # Finally, we take the mean of the rebalanced loss.
@@ -654,9 +654,22 @@ def main():
                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
                    # This is discussed in Section 4.2 of the same paper.
                    snr = compute_snr(timesteps)
-                    mse_loss_weights = (
+                    base_weight = (
                        torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
                    )
+
+                    if noise_scheduler.config.prediction_type == "v_prediction":
+                        # Velocity objective needs to be floored to an SNR weight of one.
+                        mse_loss_weights = base_weight + 1
+                    else:
+                        # Epsilon and sample both use the same loss weights.
+                        mse_loss_weights = base_weight
+
+                    # For zero-terminal SNR, we have to handle the case where a sigma of Zero results in a Inf value.
+                    # When we run this, the MSE loss weights for this timestep is set unconditionally to 1.
+                    # If we do not run this, the loss value will go to NaN almost immediately, usually within one step.
+                    mse_loss_weights[snr == 0] = 1.0
+
                    # We first calculate the original loss. Then we mean over the non-batch dimensions and
                    # rebalance the sample-wise losses with their respective loss weights.
                    # Finally, we take the mean of the rebalanced loss.
@@ -685,9 +685,22 @@ def main():
                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
                    # This is discussed in Section 4.2 of the same paper.
                    snr = compute_snr(timesteps)
-                    mse_loss_weights = (
+                    base_weight = (
                        torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
                    )
+
+                    if noise_scheduler.config.prediction_type == "v_prediction":
+                        # Velocity objective needs to be floored to an SNR weight of one.
+                        mse_loss_weights = base_weight + 1
+                    else:
+                        # Epsilon and sample both use the same loss weights.
+                        mse_loss_weights = base_weight
+
+                    # For zero-terminal SNR, we have to handle the case where a sigma of Zero results in a Inf value.
+                    # When we run this, the MSE loss weights for this timestep is set unconditionally to 1.
+                    # If we do not run this, the loss value will go to NaN almost immediately, usually within one step.
+                    mse_loss_weights[snr == 0] = 1.0
+
                    # We first calculate the original loss. Then we mean over the non-batch dimensions and
                    # rebalance the sample-wise losses with their respective loss weights.
                    # Finally, we take the mean of the rebalanced loss.
@@ -833,9 +833,22 @@ def main():
                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
                    # This is discussed in Section 4.2 of the same paper.
                    snr = compute_snr(timesteps)
-                    mse_loss_weights = (
+                    base_weight = (
                        torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
                    )
+
+                    if noise_scheduler.config.prediction_type == "v_prediction":
+                        # Velocity objective needs to be floored to an SNR weight of one.
+                        mse_loss_weights = base_weight + 1
+                    else:
+                        # Epsilon and sample both use the same loss weights.
+                        mse_loss_weights = base_weight
+
+                    # For zero-terminal SNR, we have to handle the case where a sigma of Zero results in a Inf value.
+                    # When we run this, the MSE loss weights for this timestep is set unconditionally to 1.
+                    # If we do not run this, the loss value will go to NaN almost immediately, usually within one step.
+                    mse_loss_weights[snr == 0] = 1.0
+
                    # We first calculate the original loss. Then we mean over the non-batch dimensions and
                    # rebalance the sample-wise losses with their respective loss weights.
                    # Finally, we take the mean of the rebalanced loss.
@@ -872,12 +872,21 @@ def main():
                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
                    # This is discussed in Section 4.2 of the same paper.
                    snr = compute_snr(timesteps)
-                    mse_loss_weights = (
+                    base_weight = (
                        torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
                    )
                    if noise_scheduler.config.prediction_type == "v_prediction":
                        # velocity objective prediction requires SNR weights to be floored to a min value of 1.
-                        mse_loss_weights = mse_loss_weights + 1
+                        mse_loss_weights = base_weight + 1
+                    else:
+                        # Epsilon and sample prediction use the base weights.
+                        mse_loss_weights = base_weight
+
+                    # For zero-terminal SNR, we have to handle the case where a sigma of Zero results in a Inf value.
+                    # When we run this, the MSE loss weights for this timestep is set unconditionally to 1.
+                    # If we do not run this, the loss value will go to NaN almost immediately, usually within one step.
+                    mse_loss_weights[snr == 0] = 1.0
+
                    # We first calculate the original loss. Then we mean over the non-batch dimensions and
                    # rebalance the sample-wise losses with their respective loss weights.
                    # Finally, we take the mean of the rebalanced loss.
@@ -952,12 +952,22 @@ def main():
                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
                    # This is discussed in Section 4.2 of the same paper.
                    snr = compute_snr(timesteps)
-                    mse_loss_weights = (
+                    base_weight = (
                        torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
                    )
+
                    if noise_scheduler.config.prediction_type == "v_prediction":
-                        # velocity objective prediction requires SNR weights to be floored to a min value of 1.
-                        mse_loss_weights = mse_loss_weights + 1
+                        # Velocity objective needs to be floored to an SNR weight of one.
+                        mse_loss_weights = base_weight + 1
+                    else:
+                        # Epsilon and sample both use the same loss weights.
+                        mse_loss_weights = base_weight
+
+                    # For zero-terminal SNR, we have to handle the case where a sigma of Zero results in a Inf value.
+                    # When we run this, the MSE loss weights for this timestep is set unconditionally to 1.
+                    # If we do not run this, the loss value will go to NaN almost immediately, usually within one step.
+                    mse_loss_weights[snr == 0] = 1.0
+
                    # We first calculate the original loss. Then we mean over the non-batch dimensions and
                    # rebalance the sample-wise losses with their respective loss weights.
                    # Finally, we take the mean of the rebalanced loss.
@@ -783,12 +783,22 @@ def main():
                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
                    # This is discussed in Section 4.2 of the same paper.
                    snr = compute_snr(timesteps)
-                    mse_loss_weights = (
+                    base_weight = (
                        torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
                    )
+
                    if noise_scheduler.config.prediction_type == "v_prediction":
-                        # velocity objective prediction requires SNR weights to be floored to a min value of 1.
-                        mse_loss_weights = mse_loss_weights + 1
+                        # Velocity objective needs to be floored to an SNR weight of one.
+                        mse_loss_weights = base_weight + 1
+                    else:
+                        # Epsilon and sample both use the same loss weights.
+                        mse_loss_weights = base_weight
+
+                    # For zero-terminal SNR, we have to handle the case where a sigma of Zero results in a Inf value.
+                    # When we run this, the MSE loss weights for this timestep is set unconditionally to 1.
+                    # If we do not run this, the loss value will go to NaN almost immediately, usually within one step.
+                    mse_loss_weights[snr == 0] = 1.0
+
                    # We first calculate the original loss. Then we mean over the non-batch dimensions and
                    # rebalance the sample-wise losses with their respective loss weights.
                    # Finally, we take the mean of the rebalanced loss.
@@ -1072,12 +1072,22 @@ def main(args):
                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
                    # This is discussed in Section 4.2 of the same paper.
                    snr = compute_snr(timesteps)
-                    mse_loss_weights = (
+                    base_weight = (
                        torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
                    )
+
                    if noise_scheduler.config.prediction_type == "v_prediction":
-                        # velocity objective prediction requires SNR weights to be floored to a min value of 1.
-                        mse_loss_weights = mse_loss_weights + 1
+                        # Velocity objective needs to be floored to an SNR weight of one.
+                        mse_loss_weights = base_weight + 1
+                    else:
+                        # Epsilon and sample both use the same loss weights.
+                        mse_loss_weights = base_weight
+
+                    # For zero-terminal SNR, we have to handle the case where a sigma of Zero results in a Inf value.
+                    # When we run this, the MSE loss weights for this timestep is set unconditionally to 1.
+                    # If we do not run this, the loss value will go to NaN almost immediately, usually within one step.
+                    mse_loss_weights[snr == 0] = 1.0
+
                    # We first calculate the original loss. Then we mean over the non-batch dimensions and
                    # rebalance the sample-wise losses with their respective loss weights.
                    # Finally, we take the mean of the rebalanced loss.
@@ -325,6 +325,55 @@ def parse_args(input_args=None):
    parser.add_argument(
        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
    )
+    parser.add_argument(
+        "--timestep_bias_strategy",
+        type=str,
+        default="none",
+        choices=["earlier", "later", "range", "none"],
+        help=(
+            "The timestep bias strategy, which may help direct the model toward learning low or high frequency details."
+            " Choices: ['earlier', 'later', 'range', 'none']."
+            " The default is 'none', which means no bias is applied, and training proceeds normally."
+            " The value of 'later' will increase the frequency of the model's final training timesteps."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_bias_multiplier",
+        type=float,
+        default=1.0,
+        help=(
+            "The multiplier for the bias. Defaults to 1.0, which means no bias is applied."
+            " A value of 2.0 will double the weight of the bias, and a value of 0.5 will halve it."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_bias_begin",
+        type=int,
+        default=0,
+        help=(
+            "When using `--timestep_bias_strategy=range`, the beginning (inclusive) timestep to bias."
+            " Defaults to zero, which equates to having no specific bias."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_bias_end",
+        type=int,
+        default=1000,
+        help=(
+            "When using `--timestep_bias_strategy=range`, the final timestep (inclusive) to bias."
+            " Defaults to 1000, which is the number of timesteps that Stable Diffusion is trained on."
+        ),
+    )
+    parser.add_argument(
+        "--timestep_bias_portion",
+        type=float,
+        default=0.25,
+        help=(
+            "The portion of timesteps to bias. Defaults to 0.25, which 25% of timesteps will be biased."
+            " A value of 0.5 will bias one half of the timesteps. The value provided for `--timestep_bias_strategy` determines"
+            " whether the biased portions are in the earlier or later timesteps."
+        ),
+    )
    parser.add_argument(
        "--snr_gamma",
        type=float,
@@ -479,6 +528,47 @@ def compute_vae_encodings(batch, vae):
    return {"model_input": model_input.cpu()}


+def generate_timestep_weights(args, num_timesteps):
+    weights = torch.ones(num_timesteps)
+
+    # Determine the indices to bias
+    num_to_bias = int(args.timestep_bias_portion * num_timesteps)
+
+    if args.timestep_bias_strategy == "later":
+        bias_indices = slice(-num_to_bias, None)
+    elif args.timestep_bias_strategy == "earlier":
+        bias_indices = slice(0, num_to_bias)
+    elif args.timestep_bias_strategy == "range":
+        # Out of the possible 1000 timesteps, we might want to focus on eg. 200-500.
+        range_begin = args.timestep_bias_begin
+        range_end = args.timestep_bias_end
+        if range_begin < 0:
+            raise ValueError(
+                "When using the range strategy for timestep bias, you must provide a beginning timestep greater or equal to zero."
+            )
+        if range_end > num_timesteps:
+            raise ValueError(
+                "When using the range strategy for timestep bias, you must provide an ending timestep smaller than the number of timesteps."
+            )
+        bias_indices = slice(range_begin, range_end)
+    else:  # 'none' or any other string
+        return weights
+    if args.timestep_bias_multiplier <= 0:
+        return ValueError(
+            "The parameter --timestep_bias_multiplier is not intended to be used to disable the training of specific timesteps."
+            " If it was intended to disable timestep bias, use `--timestep_bias_strategy none` instead."
+            " A timestep bias multiplier less than or equal to 0 is not allowed."
+        )
+
+    # Apply the bias
+    weights[bias_indices] *= args.timestep_bias_multiplier
+
+    # Normalize
+    weights /= weights.sum()
+
+    return weights
+
+
 def main(args):
    logging_dir = Path(args.output_dir, args.logging_dir)

@@ -935,11 +1025,18 @@ def main(args):
                    )

                bsz = model_input.shape[0]
-                # Sample a random timestep for each image
-                timesteps = torch.randint(
-                    0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
-                )
-                timesteps = timesteps.long()
+                if args.timestep_bias_strategy == "none":
+                    # Sample a random timestep for each image without bias.
+                    timesteps = torch.randint(
+                        0, noise_scheduler.config.num_train_timesteps, (bsz,), device=model_input.device
+                    )
+                else:
+                    # Sample a random timestep for each image, potentially biased by the timestep weights.
+                    # Biasing the timestep weights allows us to spend less time training irrelevant timesteps.
+                    weights = generate_timestep_weights(args, noise_scheduler.config.num_train_timesteps).to(
+                        model_input.device
+                    )
+                    timesteps = torch.multinomial(weights, bsz, replacement=True).long()

                # Add noise to the model input according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
@@ -1003,6 +1100,11 @@ def main(args):
                        # Epsilon and sample both use the same loss weights.
                        mse_loss_weights = base_weight

+                    # For zero-terminal SNR, we have to handle the case where a sigma of Zero results in a Inf value.
+                    # When we run this, the MSE loss weights for this timestep is set unconditionally to 1.
+                    # If we do not run this, the loss value will go to NaN almost immediately, usually within one step.
+                    mse_loss_weights[snr == 0] = 1.0
+
                    # We first calculate the original loss. Then we mean over the non-batch dimensions and
                    # rebalance the sample-wise losses with their respective loss weights.
                    # Finally, we take the mean of the rebalanced loss.
@@ -128,7 +128,6 @@ _deps = [
    "torchvision",
    "transformers>=4.25.1",
    "urllib3<=2.0.0",
-    "peft>=0.5.0"
 ]

 # this is a lookup table with items like:
@@ -3,6 +3,7 @@ __version__ = "0.22.0.dev0"
 from typing import TYPE_CHECKING

 from .utils import (
+    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_flax_available,
@@ -368,6 +369,7 @@ else:
            "FlaxDDIMScheduler",
            "FlaxDDPMScheduler",
            "FlaxDPMSolverMultistepScheduler",
+            "FlaxEulerDiscreteScheduler",
            "FlaxKarrasVeScheduler",
            "FlaxLMSDiscreteScheduler",
            "FlaxPNDMScheduler",
@@ -395,6 +397,7 @@ else:
            "FlaxStableDiffusionImg2ImgPipeline",
            "FlaxStableDiffusionInpaintPipeline",
            "FlaxStableDiffusionPipeline",
+            "FlaxStableDiffusionXLPipeline",
        ]
    )

@@ -412,7 +415,7 @@ except OptionalDependencyNotAvailable:
 else:
    _import_structure["pipelines"].extend(["MidiProcessor"])

-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    from .configuration_utils import ConfigMixin

    try:
@@ -673,6 +676,7 @@ if TYPE_CHECKING:
            FlaxDDIMScheduler,
            FlaxDDPMScheduler,
            FlaxDPMSolverMultistepScheduler,
+            FlaxEulerDiscreteScheduler,
            FlaxKarrasVeScheduler,
            FlaxLMSDiscreteScheduler,
            FlaxPNDMScheduler,
@@ -691,6 +695,7 @@ if TYPE_CHECKING:
            FlaxStableDiffusionImg2ImgPipeline,
            FlaxStableDiffusionInpaintPipeline,
            FlaxStableDiffusionPipeline,
+            FlaxStableDiffusionXLPipeline,
        )

    try:
@@ -41,5 +41,4 @@ deps = {
    "torchvision": "torchvision",
    "transformers": "transformers>=4.25.1",
    "urllib3": "urllib3<=2.0.0",
-    "peft": "peft>=0.5.0",
 }
@@ -16,7 +16,7 @@ import warnings
 from typing import List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from PIL import Image

@@ -48,7 +48,7 @@ class VaeImageProcessor(ConfigMixin):
            Resampling filter to use when resizing the image.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image to [-1,1].
-        do_binarize (`bool`, *optional*, defaults to `True`):
+        do_binarize (`bool`, *optional*, defaults to `False`):
            Whether to binarize the image to 0/1.
        do_convert_rgb (`bool`, *optional*, defaults to be `False`):
            Whether to convert the images to RGB format.
@@ -35,17 +35,12 @@ from .utils import (
    convert_state_dict_to_diffusers,
    convert_state_dict_to_peft,
    deprecate,
-    get_adapter_name,
-    get_rank_and_alpha_pattern,
    is_accelerate_available,
    is_omegaconf_available,
    is_peft_available,
    is_transformers_available,
    logging,
    recurse_remove_peft_layers,
-    scale_lora_layers,
-    set_adapter_layers,
-    set_weights_and_activate_adapters,
 )
 from .utils.import_utils import BACKENDS_MAPPING

@@ -1105,9 +1100,7 @@ class LoraLoaderMixin:
    num_fused_loras = 0
    use_peft_backend = USE_PEFT_BACKEND

-    def load_lora_weights(
-        self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], adapter_name=None, **kwargs
-    ):
+    def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]], **kwargs):
        """
        Load LoRA weights specified in `pretrained_model_name_or_path_or_dict` into `self.unet` and
        `self.text_encoder`.
@@ -1151,7 +1144,6 @@ class LoraLoaderMixin:
            lora_scale=self.lora_scale,
            low_cpu_mem_usage=low_cpu_mem_usage,
            _pipeline=self,
-            adapter_name=adapter_name,
        )

    @classmethod
@@ -1298,6 +1290,7 @@ class LoraLoaderMixin:
            state_dict = pretrained_model_name_or_path_or_dict

        network_alphas = None
+        # TODO: replace it with a method from `state_dict_utils`
        if all(
            (
                k.startswith("lora_te_")
@@ -1508,7 +1501,6 @@ class LoraLoaderMixin:
        lora_scale=1.0,
        low_cpu_mem_usage=None,
        _pipeline=None,
-        adapter_name=None,
    ):
        """
        This will load the LoRA layers specified in `state_dict` into `text_encoder`
@@ -1531,9 +1523,6 @@ class LoraLoaderMixin:
                tries to not use more than 1x model size in CPU memory (including peak memory) while loading the model.
                Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
                argument to `True` will raise an error.
-            adapter_name (`str`, *optional*):
-                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
-                `default_{i}` where i is the total number of adapters being loaded
        """
        low_cpu_mem_usage = low_cpu_mem_usage if low_cpu_mem_usage is not None else _LOW_CPU_MEM_USAGE_DEFAULT

@@ -1562,15 +1551,15 @@ class LoraLoaderMixin:

                    for name, _ in text_encoder_attn_modules(text_encoder):
                        rank_key = f"{name}.out_proj.lora_B.weight"
-                        rank.update({rank_key: text_encoder_lora_state_dict[rank_key].shape[1]})
+                        rank[rank_key] = text_encoder_lora_state_dict[rank_key].shape[1]

                    patch_mlp = any(".mlp." in key for key in text_encoder_lora_state_dict.keys())
                    if patch_mlp:
                        for name, _ in text_encoder_mlp_modules(text_encoder):
                            rank_key_fc1 = f"{name}.fc1.lora_B.weight"
                            rank_key_fc2 = f"{name}.fc2.lora_B.weight"
-                            rank.update({rank_key_fc1: text_encoder_lora_state_dict[rank_key_fc1].shape[1]})
-                            rank.update({rank_key_fc2: text_encoder_lora_state_dict[rank_key_fc2].shape[1]})
+                            rank[rank_key_fc1] = text_encoder_lora_state_dict[rank_key_fc1].shape[1]
+                            rank[rank_key_fc2] = text_encoder_lora_state_dict[rank_key_fc2].shape[1]
                else:
                    for name, _ in text_encoder_attn_modules(text_encoder):
                        rank_key = f"{name}.out_proj.lora_linear_layer.up.weight"
@@ -1581,8 +1570,8 @@ class LoraLoaderMixin:
                        for name, _ in text_encoder_mlp_modules(text_encoder):
                            rank_key_fc1 = f"{name}.fc1.lora_linear_layer.up.weight"
                            rank_key_fc2 = f"{name}.fc2.lora_linear_layer.up.weight"
-                            rank.update({rank_key_fc1: text_encoder_lora_state_dict[rank_key_fc1].shape[1]})
-                            rank.update({rank_key_fc2: text_encoder_lora_state_dict[rank_key_fc2].shape[1]})
+                            rank[rank_key_fc1] = text_encoder_lora_state_dict[rank_key_fc1].shape[1]
+                            rank[rank_key_fc2] = text_encoder_lora_state_dict[rank_key_fc2].shape[1]

                if network_alphas is not None:
                    alpha_keys = [
@@ -1595,30 +1584,19 @@ class LoraLoaderMixin:
                if cls.use_peft_backend:
                    from peft import LoraConfig

-                    r, lora_alpha, rank_pattern, alpha_pattern, target_modules = get_rank_and_alpha_pattern(
-                        rank, network_alphas, text_encoder_lora_state_dict
-                    )
+                    lora_rank = list(rank.values())[0]
+                    # By definition, the scale should be alpha divided by rank.
+                    # https://github.com/huggingface/peft/blob/ba0477f2985b1ba311b83459d29895c809404e99/src/peft/tuners/lora/layer.py#L71
+                    alpha = lora_scale * lora_rank

-                    lora_config = LoraConfig(
-                        r=r,
-                        target_modules=target_modules,
-                        lora_alpha=lora_alpha,
-                        rank_pattern=rank_pattern,
-                        alpha_pattern=alpha_pattern,
-                    )
+                    target_modules = ["q_proj", "k_proj", "v_proj", "out_proj"]
+                    if patch_mlp:
+                        target_modules += ["fc1", "fc2"]

-                    # adapter_name
-                    if adapter_name is None:
-                        adapter_name = get_adapter_name(text_encoder)
+                    # TODO: support multi alpha / rank: https://github.com/huggingface/peft/pull/873
+                    lora_config = LoraConfig(r=lora_rank, target_modules=target_modules, lora_alpha=alpha)

-                    # inject LoRA layers and load the state dict
-                    text_encoder.load_adapter(
-                        adapter_name=adapter_name,
-                        adapter_state_dict=text_encoder_lora_state_dict,
-                        peft_config=lora_config,
-                    )
-                    # scale LoRA layers with `lora_scale`
-                    scale_lora_layers(text_encoder, lora_weightage=lora_scale)
+                    text_encoder.load_adapter(adapter_state_dict=text_encoder_lora_state_dict, peft_config=lora_config)

                    is_model_cpu_offload = False
                    is_sequential_cpu_offload = False
@@ -1700,8 +1678,15 @@ class LoraLoaderMixin:

        if hasattr(self, "text_encoder"):
            remove_method(self.text_encoder)
+
+            if self.use_peft_backend:
+                del self.text_encoder.peft_config
+                self.text_encoder._hf_peft_config_loaded = None
        if hasattr(self, "text_encoder_2"):
            remove_method(self.text_encoder_2)
+            if self.use_peft_backend:
+                del self.text_encoder_2.peft_config
+                self.text_encoder_2._hf_peft_config_loaded = None

    @classmethod
    def _remove_text_encoder_monkey_patch_classmethod(cls, text_encoder):
@@ -1937,7 +1922,7 @@ class LoraLoaderMixin:
                diffusers_name = diffusers_name.replace("emb.layers", "time_emb_proj")

                # SDXL specificity.
-                if "emb" in diffusers_name and "time" not in diffusers_name:
+                if "emb" in diffusers_name and "time.emb.proj" not in diffusers_name:
                    pattern = r"\.\d+(?=\D*$)"
                    diffusers_name = re.sub(pattern, "", diffusers_name, count=1)
                if ".in." in diffusers_name:
@@ -1950,7 +1935,7 @@ class LoraLoaderMixin:
                    diffusers_name = diffusers_name.replace("skip.connection", "conv_shortcut")

                # LyCORIS specificity.
-                if "time" in diffusers_name:
+                if "time.emb.proj" in diffusers_name:
                    diffusers_name = diffusers_name.replace("time.emb.proj", "time_emb_proj")
                if "conv.shortcut" in diffusers_name:
                    diffusers_name = diffusers_name.replace("conv.shortcut", "conv_shortcut")
@@ -2193,65 +2178,6 @@ class LoraLoaderMixin:

        self.num_fused_loras -= 1

-    def set_adapter(
-        self,
-        adapter_names: Union[List[str], str],
-        unet_weights: List[float] = None,
-        te_weights: List[float] = None,
-        te2_weights: List[float] = None,
-    ):
-        if not self.use_peft_backend:
-            raise ValueError("PEFT backend is required for this method.")
-
-        def process_weights(adapter_names, weights):
-            if weights is None:
-                weights = [1.0] * len(adapter_names)
-            elif isinstance(weights, float):
-                weights = [weights]
-
-            if len(adapter_names) != len(weights):
-                raise ValueError(
-                    f"Length of adapter names {len(adapter_names)} is not equal to the length of the weights {len(weights)}"
-                )
-            return weights
-
-        adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names
-
-        # To Do
-        # Handle the UNET
-
-        # Handle the Text Encoder
-        te_weights = process_weights(adapter_names, te_weights)
-        if hasattr(self, "text_encoder"):
-            set_weights_and_activate_adapters(self.text_encoder, adapter_names, te_weights)
-        te2_weights = process_weights(adapter_names, te2_weights)
-        if hasattr(self, "text_encoder_2"):
-            set_weights_and_activate_adapters(self.text_encoder_2, adapter_names, te2_weights)
-
-    def disable_lora(self):
-        if not self.use_peft_backend:
-            raise ValueError("PEFT backend is required for this method.")
-        # To Do
-        # Disbale unet adapters
-
-        # Disbale text encoder adapters
-        if hasattr(self, "text_encoder"):
-            set_adapter_layers(self.text_encoder, enabled=False)
-        if hasattr(self, "text_encoder_2"):
-            set_adapter_layers(self.text_encoder_2, enabled=False)
-
-    def enable_lora(self):
-        if not self.use_peft_backend:
-            raise ValueError("PEFT backend is required for this method.")
-        # To Do
-        # Enable unet adapters
-
-        # Enable text encoder adapters
-        if hasattr(self, "text_encoder"):
-            set_adapter_layers(self.text_encoder, enabled=True)
-        if hasattr(self, "text_encoder_2"):
-            set_adapter_layers(self.text_encoder_2, enabled=True)
-

 class FromSingleFileMixin:
    """
@@ -2955,7 +2881,14 @@ class StableDiffusionXLLoraLoaderMixin(LoraLoaderMixin):
    def _remove_text_encoder_monkey_patch(self):
        if self.use_peft_backend:
            recurse_remove_peft_layers(self.text_encoder)
+            # TODO: @younesbelkada handle this in transformers side
+            del self.text_encoder.peft_config
+            self.text_encoder._hf_peft_config_loaded = None
+
            recurse_remove_peft_layers(self.text_encoder_2)
+
+            del self.text_encoder_2.peft_config
+            self.text_encoder_2._hf_peft_config_loaded = None
        else:
            self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder)
            self._remove_text_encoder_monkey_patch_classmethod(self.text_encoder_2)
@@ -14,7 +14,7 @@

 from typing import TYPE_CHECKING

-from ..utils import _LazyModule, is_flax_available, is_torch_available
+from ..utils import DIFFUSERS_SLOW_IMPORT, _LazyModule, is_flax_available, is_torch_available


 _import_structure = {}
@@ -43,7 +43,7 @@ if is_flax_available():
    _import_structure["vae_flax"] = ["FlaxAutoencoderKL"]


-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    if is_torch_available():
        from .adapter import MultiAdapter, T2IAdapter
        from .autoencoder_asym_kl import AsymmetricAutoencoderKL
@@ -252,7 +252,10 @@ class T2IAdapter(ModelMixin, ConfigMixin):
        elif adapter_type == "light_adapter":
            self.adapter = LightAdapter(in_channels, channels, num_res_blocks, downscale_factor)
        else:
-            raise ValueError(f"unknown adapter_type: {type}. Choose either 'full_adapter' or 'simple_adapter'")
+            raise ValueError(
+                f"Unsupported adapter_type: '{adapter_type}'. Choose either 'full_adapter' or "
+                "'full_adapter_xl' or 'light_adapter'."
+            )

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        return self.adapter(x)
@@ -331,8 +334,8 @@ class FullAdapterXL(nn.Module):
                self.body.append(AdapterBlock(channels[i], channels[i], num_res_blocks))

        self.body = nn.ModuleList(self.body)
-        # XL has one fewer downsampling
-        self.total_downscale_factor = downscale_factor * 2 ** (len(channels) - 2)
+        # XL has only one downsampling AdapterBlock.
+        self.total_downscale_factor = downscale_factor * 2

    def forward(self, x: torch.Tensor) -> List[torch.Tensor]:
        x = self.unshuffle(x)
@@ -19,7 +19,7 @@ import torch.nn.functional as F
 from torch import nn

 from ..loaders import PatchedLoraProjection, text_encoder_attn_modules, text_encoder_mlp_modules
-from ..utils import logging, scale_lora_layers
+from ..utils import logging


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -27,7 +27,11 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

 def adjust_lora_scale_text_encoder(text_encoder, lora_scale: float = 1.0, use_peft_backend: bool = False):
    if use_peft_backend:
-        scale_lora_layers(text_encoder, lora_weightage=lora_scale)
+        from peft.tuners.lora import LoraLayer
+
+        for module in text_encoder.modules():
+            if isinstance(module, LoraLayer):
+                module.scaling[module.active_adapter] = lora_scale
    else:
        for _, attn_module in text_encoder_attn_modules(text_encoder):
            if isinstance(attn_module.q_proj, PatchedLoraProjection):
@@ -42,9 +42,25 @@ def rename_key(key):
 # and https://github.com/patil-suraj/stable-diffusion-jax/blob/main/stable_diffusion_jax/convert_diffusers_to_jax.py
 def rename_key_and_reshape_tensor(pt_tuple_key, pt_tensor, random_flax_state_dict):
    """Rename PT weight names to corresponding Flax weight names and reshape tensor if necessary"""
-
    # conv norm or layer norm
    renamed_pt_tuple_key = pt_tuple_key[:-1] + ("scale",)
+
+    # rename attention layers
+    if len(pt_tuple_key) > 1:
+        for rename_from, rename_to in (
+            ("to_out_0", "proj_attn"),
+            ("to_k", "key"),
+            ("to_v", "value"),
+            ("to_q", "query"),
+        ):
+            if pt_tuple_key[-2] == rename_from:
+                weight_name = pt_tuple_key[-1]
+                weight_name = "kernel" if weight_name == "weight" else weight_name
+                renamed_pt_tuple_key = pt_tuple_key[:-2] + (rename_to, weight_name)
+                if renamed_pt_tuple_key in random_flax_state_dict:
+                    assert random_flax_state_dict[renamed_pt_tuple_key].shape == pt_tensor.T.shape
+                    return renamed_pt_tuple_key, pt_tensor.T
+
    if (
        any("norm" in str_ for str_ in pt_tuple_key)
        and (pt_tuple_key[-1] == "bias")
@@ -303,23 +303,23 @@ class FlaxModelMixin(PushToHubMixin):
            "framework": "flax",
        }

-        # Load config if we don't provide a configuration
-        config_path = config if config is not None else pretrained_model_name_or_path
-        model, model_kwargs = cls.from_config(
-            config_path,
-            cache_dir=cache_dir,
-            return_unused_kwargs=True,
-            force_download=force_download,
-            resume_download=resume_download,
-            proxies=proxies,
-            local_files_only=local_files_only,
-            use_auth_token=use_auth_token,
-            revision=revision,
-            subfolder=subfolder,
-            # model args
-            dtype=dtype,
-            **kwargs,
-        )
+        # Load config if we don't provide one
+        if config is None:
+            config, unused_kwargs = cls.load_config(
+                pretrained_model_name_or_path,
+                cache_dir=cache_dir,
+                return_unused_kwargs=True,
+                force_download=force_download,
+                resume_download=resume_download,
+                proxies=proxies,
+                local_files_only=local_files_only,
+                use_auth_token=use_auth_token,
+                revision=revision,
+                subfolder=subfolder,
+                **kwargs,
+            )
+
+        model, model_kwargs = cls.from_config(config, dtype=dtype, return_unused_kwargs=True, **unused_kwargs)

        # Load model
        pretrained_path_with_subfolder = (
@@ -52,6 +52,7 @@ class FlaxCrossAttnDownBlock2D(nn.Module):
    only_cross_attention: bool = False
    use_memory_efficient_attention: bool = False
    dtype: jnp.dtype = jnp.float32
+    transformer_layers_per_block: int = 1

    def setup(self):
        resnets = []
@@ -72,7 +73,7 @@ class FlaxCrossAttnDownBlock2D(nn.Module):
                in_channels=self.out_channels,
                n_heads=self.num_attention_heads,
                d_head=self.out_channels // self.num_attention_heads,
-                depth=1,
+                depth=self.transformer_layers_per_block,
                use_linear_projection=self.use_linear_projection,
                only_cross_attention=self.only_cross_attention,
                use_memory_efficient_attention=self.use_memory_efficient_attention,
@@ -192,6 +193,7 @@ class FlaxCrossAttnUpBlock2D(nn.Module):
    only_cross_attention: bool = False
    use_memory_efficient_attention: bool = False
    dtype: jnp.dtype = jnp.float32
+    transformer_layers_per_block: int = 1

    def setup(self):
        resnets = []
@@ -213,7 +215,7 @@ class FlaxCrossAttnUpBlock2D(nn.Module):
                in_channels=self.out_channels,
                n_heads=self.num_attention_heads,
                d_head=self.out_channels // self.num_attention_heads,
-                depth=1,
+                depth=self.transformer_layers_per_block,
                use_linear_projection=self.use_linear_projection,
                only_cross_attention=self.only_cross_attention,
                use_memory_efficient_attention=self.use_memory_efficient_attention,
@@ -331,6 +333,7 @@ class FlaxUNetMidBlock2DCrossAttn(nn.Module):
    use_linear_projection: bool = False
    use_memory_efficient_attention: bool = False
    dtype: jnp.dtype = jnp.float32
+    transformer_layers_per_block: int = 1

    def setup(self):
        # there is always at least one resnet
@@ -350,7 +353,7 @@ class FlaxUNetMidBlock2DCrossAttn(nn.Module):
                in_channels=self.in_channels,
                n_heads=self.num_attention_heads,
                d_head=self.in_channels // self.num_attention_heads,
-                depth=1,
+                depth=self.transformer_layers_per_block,
                use_linear_projection=self.use_linear_projection,
                use_memory_efficient_attention=self.use_memory_efficient_attention,
                dtype=self.dtype,
@@ -883,7 +883,6 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
            time_ids = added_cond_kwargs.get("time_ids")
            time_embeds = self.add_time_proj(time_ids.flatten())
            time_embeds = time_embeds.reshape((text_embeds.shape[0], -1))
-
            add_embeds = torch.concat([text_embeds, time_embeds], dim=-1)
            add_embeds = add_embeds.to(emb.dtype)
            aug_emb = self.add_embedding(add_embeds)
@@ -946,6 +945,9 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin)
        is_adapter = mid_block_additional_residual is None and down_block_additional_residuals is not None

        down_block_res_samples = (sample,)
+        print("emb", emb.abs().sum())
+        print("sample", sample.abs().sum())
+
        for downsample_block in self.down_blocks:
            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
                # For t2i-adapter CrossAttnDownBlock2D
@@ -11,7 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Optional, Tuple, Union
+from typing import Dict, Optional, Tuple, Union

 import flax
 import flax.linen as nn
@@ -116,6 +116,11 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
    flip_sin_to_cos: bool = True
    freq_shift: int = 0
    use_memory_efficient_attention: bool = False
+    transformer_layers_per_block: Union[int, Tuple[int]] = 1
+    addition_embed_type: Optional[str] = None
+    addition_time_embed_dim: Optional[int] = None
+    addition_embed_type_num_heads: int = 64
+    projection_class_embeddings_input_dim: Optional[int] = None

    def init_weights(self, rng: jax.random.KeyArray) -> FrozenDict:
        # init input tensors
@@ -127,7 +132,27 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
        params_rng, dropout_rng = jax.random.split(rng)
        rngs = {"params": params_rng, "dropout": dropout_rng}

-        return self.init(rngs, sample, timesteps, encoder_hidden_states)["params"]
+        added_cond_kwargs = None
+        if self.addition_embed_type == "text_time":
+            # we retrieve the expected `text_embeds_dim` by first checking if the architecture is a refiner
+            # or non-refiner architecture and then by "reverse-computing" from `projection_class_embeddings_input_dim`
+            is_refiner = (
+                5 * self.config.addition_time_embed_dim + self.config.cross_attention_dim
+                == self.config.projection_class_embeddings_input_dim
+            )
+            num_micro_conditions = 5 if is_refiner else 6
+
+            text_embeds_dim = self.config.projection_class_embeddings_input_dim - (
+                num_micro_conditions * self.config.addition_time_embed_dim
+            )
+
+            time_ids_channels = self.projection_class_embeddings_input_dim - text_embeds_dim
+            time_ids_dims = time_ids_channels // self.addition_time_embed_dim
+            added_cond_kwargs = {
+                "text_embeds": jnp.zeros((1, text_embeds_dim), dtype=jnp.float32),
+                "time_ids": jnp.zeros((1, time_ids_dims), dtype=jnp.float32),
+            }
+        return self.init(rngs, sample, timesteps, encoder_hidden_states, added_cond_kwargs)["params"]

    def setup(self):
        block_out_channels = self.block_out_channels
@@ -168,6 +193,24 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
        if isinstance(num_attention_heads, int):
            num_attention_heads = (num_attention_heads,) * len(self.down_block_types)

+        # transformer layers per block
+        transformer_layers_per_block = self.transformer_layers_per_block
+        if isinstance(transformer_layers_per_block, int):
+            transformer_layers_per_block = [transformer_layers_per_block] * len(self.down_block_types)
+
+        # addition embed types
+        if self.addition_embed_type is None:
+            self.add_embedding = None
+        elif self.addition_embed_type == "text_time":
+            if self.addition_time_embed_dim is None:
+                raise ValueError(
+                    f"addition_embed_type {self.addition_embed_type} requires `addition_time_embed_dim` to not be None"
+                )
+            self.add_time_proj = FlaxTimesteps(self.addition_time_embed_dim, self.flip_sin_to_cos, self.freq_shift)
+            self.add_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype)
+        else:
+            raise ValueError(f"addition_embed_type: {self.addition_embed_type} must be None or `text_time`.")
+
        # down
        down_blocks = []
        output_channel = block_out_channels[0]
@@ -182,6 +225,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
                    out_channels=output_channel,
                    dropout=self.dropout,
                    num_layers=self.layers_per_block,
+                    transformer_layers_per_block=transformer_layers_per_block[i],
                    num_attention_heads=num_attention_heads[i],
                    add_downsample=not is_final_block,
                    use_linear_projection=self.use_linear_projection,
@@ -207,6 +251,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
            in_channels=block_out_channels[-1],
            dropout=self.dropout,
            num_attention_heads=num_attention_heads[-1],
+            transformer_layers_per_block=transformer_layers_per_block[-1],
            use_linear_projection=self.use_linear_projection,
            use_memory_efficient_attention=self.use_memory_efficient_attention,
            dtype=self.dtype,
@@ -218,6 +263,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
        reversed_num_attention_heads = list(reversed(num_attention_heads))
        only_cross_attention = list(reversed(only_cross_attention))
        output_channel = reversed_block_out_channels[0]
+        reversed_transformer_layers_per_block = list(reversed(transformer_layers_per_block))
        for i, up_block_type in enumerate(self.up_block_types):
            prev_output_channel = output_channel
            output_channel = reversed_block_out_channels[i]
@@ -231,6 +277,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
                    out_channels=output_channel,
                    prev_output_channel=prev_output_channel,
                    num_layers=self.layers_per_block + 1,
+                    transformer_layers_per_block=reversed_transformer_layers_per_block[i],
                    num_attention_heads=reversed_num_attention_heads[i],
                    add_upsample=not is_final_block,
                    dropout=self.dropout,
@@ -269,6 +316,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
        sample,
        timesteps,
        encoder_hidden_states,
+        added_cond_kwargs: Optional[Union[Dict, FrozenDict]] = None,
        down_block_additional_residuals=None,
        mid_block_additional_residual=None,
        return_dict: bool = True,
@@ -300,10 +348,40 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
        t_emb = self.time_proj(timesteps)
        t_emb = self.time_embedding(t_emb)

+        # additional embeddings
+        aug_emb = None
+        if self.addition_embed_type == "text_time":
+            if added_cond_kwargs is None:
+                raise ValueError(
+                    f"Need to provide argument `added_cond_kwargs` for {self.__class__} when using `addition_embed_type={self.addition_embed_type}`"
+                )
+            text_embeds = added_cond_kwargs.get("text_embeds")
+            if text_embeds is None:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`"
+                )
+            time_ids = added_cond_kwargs.get("time_ids")
+            if time_ids is None:
+                raise ValueError(
+                    f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`"
+                )
+            # compute time embeds
+            time_embeds = self.add_time_proj(jnp.ravel(time_ids))  # (1, 6) => (6,) => (6, 256)
+            time_embeds = jnp.reshape(time_embeds, (text_embeds.shape[0], -1))
+            add_embeds = jnp.concatenate([text_embeds, time_embeds], axis=-1)
+            aug_emb = self.add_embedding(add_embeds)
+
+        t_emb = t_emb + aug_emb if aug_emb is not None else t_emb
+
        # 2. pre-process
        sample = jnp.transpose(sample, (0, 2, 3, 1))
        sample = self.conv_in(sample)

+        if not isinstance(t_emb, jax._src.interpreters.partial_eval.DynamicJaxprTracer):
+            import torch; import numpy as np
+            print("t_emb", torch.from_numpy(np.asarray(t_emb)).abs().sum())
+            print("sample", torch.from_numpy(np.asarray(sample)).abs().sum())
+
        # 3. down
        down_block_res_samples = (sample,)
        for down_block in self.down_blocks:
@@ -1,460 +1,472 @@
-from typing import TYPE_CHECKING
-
-from ..utils import (
-    OptionalDependencyNotAvailable,
-    _LazyModule,
-    get_objects_from_module,
-    is_flax_available,
-    is_k_diffusion_available,
-    is_librosa_available,
-    is_note_seq_available,
-    is_onnx_available,
-    is_torch_available,
-    is_transformers_available,
-)
-
-
-# These modules contain pipelines from multiple libraries/frameworks
-_dummy_objects = {}
-_import_structure = {"stable_diffusion": [], "latent_diffusion": [], "controlnet": []}
-
-try:
-    if not is_torch_available():
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ..utils import dummy_pt_objects  # noqa F403
-
-    _dummy_objects.update(get_objects_from_module(dummy_pt_objects))
-else:
-    _import_structure["auto_pipeline"] = [
-        "AutoPipelineForImage2Image",
-        "AutoPipelineForInpainting",
-        "AutoPipelineForText2Image",
-    ]
-    _import_structure["consistency_models"] = ["ConsistencyModelPipeline"]
-    _import_structure["dance_diffusion"] = ["DanceDiffusionPipeline"]
-    _import_structure["ddim"] = ["DDIMPipeline"]
-    _import_structure["ddpm"] = ["DDPMPipeline"]
-    _import_structure["dit"] = ["DiTPipeline"]
-    _import_structure["latent_diffusion"].extend(["LDMSuperResolutionPipeline"])
-    _import_structure["latent_diffusion_uncond"] = ["LDMPipeline"]
-    _import_structure["pipeline_utils"] = ["AudioPipelineOutput", "DiffusionPipeline", "ImagePipelineOutput"]
-    _import_structure["pndm"] = ["PNDMPipeline"]
-    _import_structure["repaint"] = ["RePaintPipeline"]
-    _import_structure["score_sde_ve"] = ["ScoreSdeVePipeline"]
-    _import_structure["stochastic_karras_ve"] = ["KarrasVePipeline"]
-try:
-    if not (is_torch_available() and is_librosa_available()):
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ..utils import dummy_torch_and_librosa_objects  # noqa F403
-
-    _dummy_objects.update(get_objects_from_module(dummy_torch_and_librosa_objects))
-else:
-    _import_structure["audio_diffusion"] = ["AudioDiffusionPipeline", "Mel"]
-try:
-    if not (is_torch_available() and is_transformers_available()):
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ..utils import dummy_torch_and_transformers_objects  # noqa F403
-
-    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
-else:
-    _import_structure["alt_diffusion"] = ["AltDiffusionImg2ImgPipeline", "AltDiffusionPipeline"]
-    _import_structure["audioldm"] = ["AudioLDMPipeline"]
-    _import_structure["audioldm2"] = [
-        "AudioLDM2Pipeline",
-        "AudioLDM2ProjectionModel",
-        "AudioLDM2UNet2DConditionModel",
-    ]
-    _import_structure["blip_diffusion"] = ["BlipDiffusionPipeline"]
-    _import_structure["controlnet"].extend(
-        [
-            "BlipDiffusionControlNetPipeline",
-            "StableDiffusionControlNetImg2ImgPipeline",
-            "StableDiffusionControlNetInpaintPipeline",
-            "StableDiffusionControlNetPipeline",
-            "StableDiffusionXLControlNetImg2ImgPipeline",
-            "StableDiffusionXLControlNetInpaintPipeline",
-            "StableDiffusionXLControlNetPipeline",
-        ]
-    )
-    _import_structure["deepfloyd_if"] = [
-        "IFImg2ImgPipeline",
-        "IFImg2ImgSuperResolutionPipeline",
-        "IFInpaintingPipeline",
-        "IFInpaintingSuperResolutionPipeline",
-        "IFPipeline",
-        "IFSuperResolutionPipeline",
-    ]
-    _import_structure["kandinsky"] = [
-        "KandinskyCombinedPipeline",
-        "KandinskyImg2ImgCombinedPipeline",
-        "KandinskyImg2ImgPipeline",
-        "KandinskyInpaintCombinedPipeline",
-        "KandinskyInpaintPipeline",
-        "KandinskyPipeline",
-        "KandinskyPriorPipeline",
-    ]
-    _import_structure["kandinsky2_2"] = [
-        "KandinskyV22CombinedPipeline",
-        "KandinskyV22ControlnetImg2ImgPipeline",
-        "KandinskyV22ControlnetPipeline",
-        "KandinskyV22Img2ImgCombinedPipeline",
-        "KandinskyV22Img2ImgPipeline",
-        "KandinskyV22InpaintCombinedPipeline",
-        "KandinskyV22InpaintPipeline",
-        "KandinskyV22Pipeline",
-        "KandinskyV22PriorEmb2EmbPipeline",
-        "KandinskyV22PriorPipeline",
-    ]
-    _import_structure["latent_diffusion"].extend(["LDMTextToImagePipeline"])
-    _import_structure["musicldm"] = ["MusicLDMPipeline"]
-    _import_structure["paint_by_example"] = ["PaintByExamplePipeline"]
-    _import_structure["semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"]
-    _import_structure["shap_e"] = ["ShapEImg2ImgPipeline", "ShapEPipeline"]
-    _import_structure["stable_diffusion"].extend(
-        [
-            "CLIPImageProjection",
-            "CycleDiffusionPipeline",
-            "StableDiffusionAttendAndExcitePipeline",
-            "StableDiffusionDepth2ImgPipeline",
-            "StableDiffusionDiffEditPipeline",
-            "StableDiffusionGLIGENPipeline",
-            "StableDiffusionGLIGENPipeline",
-            "StableDiffusionGLIGENTextImagePipeline",
-            "StableDiffusionImageVariationPipeline",
-            "StableDiffusionImg2ImgPipeline",
-            "StableDiffusionInpaintPipeline",
-            "StableDiffusionInpaintPipelineLegacy",
-            "StableDiffusionInstructPix2PixPipeline",
-            "StableDiffusionLatentUpscalePipeline",
-            "StableDiffusionLDM3DPipeline",
-            "StableDiffusionModelEditingPipeline",
-            "StableDiffusionPanoramaPipeline",
-            "StableDiffusionParadigmsPipeline",
-            "StableDiffusionPipeline",
-            "StableDiffusionPix2PixZeroPipeline",
-            "StableDiffusionSAGPipeline",
-            "StableDiffusionUpscalePipeline",
-            "StableUnCLIPImg2ImgPipeline",
-            "StableUnCLIPPipeline",
-        ]
-    )
-    _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"]
-    _import_structure["stable_diffusion_xl"] = [
-        "StableDiffusionXLImg2ImgPipeline",
-        "StableDiffusionXLInpaintPipeline",
-        "StableDiffusionXLInstructPix2PixPipeline",
-        "StableDiffusionXLPipeline",
-    ]
-    _import_structure["t2i_adapter"] = ["StableDiffusionAdapterPipeline", "StableDiffusionXLAdapterPipeline"]
-    _import_structure["text_to_video_synthesis"] = [
-        "TextToVideoSDPipeline",
-        "TextToVideoZeroPipeline",
-        "VideoToVideoSDPipeline",
-    ]
-    _import_structure["unclip"] = ["UnCLIPImageVariationPipeline", "UnCLIPPipeline"]
-    _import_structure["unidiffuser"] = [
-        "ImageTextPipelineOutput",
-        "UniDiffuserModel",
-        "UniDiffuserPipeline",
-        "UniDiffuserTextDecoder",
-    ]
-    _import_structure["versatile_diffusion"] = [
-        "VersatileDiffusionDualGuidedPipeline",
-        "VersatileDiffusionImageVariationPipeline",
-        "VersatileDiffusionPipeline",
-        "VersatileDiffusionTextToImagePipeline",
-    ]
-    _import_structure["vq_diffusion"] = ["VQDiffusionPipeline"]
-    _import_structure["wuerstchen"] = [
-        "WuerstchenCombinedPipeline",
-        "WuerstchenDecoderPipeline",
-        "WuerstchenPriorPipeline",
-    ]
-try:
-    if not is_onnx_available():
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ..utils import dummy_onnx_objects  # noqa F403
-
-    _dummy_objects.update(get_objects_from_module(dummy_onnx_objects))
-else:
-    _import_structure["onnx_utils"] = ["OnnxRuntimeModel"]
-try:
-    if not (is_torch_available() and is_transformers_available() and is_onnx_available()):
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ..utils import dummy_torch_and_transformers_and_onnx_objects  # noqa F403
-
-    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_onnx_objects))
-else:
-    _import_structure["stable_diffusion"].extend(
-        [
-            "OnnxStableDiffusionImg2ImgPipeline",
-            "OnnxStableDiffusionInpaintPipeline",
-            "OnnxStableDiffusionInpaintPipelineLegacy",
-            "OnnxStableDiffusionPipeline",
-            "OnnxStableDiffusionUpscalePipeline",
-            "StableDiffusionOnnxPipeline",
-        ]
-    )
-try:
-    if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()):
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ..utils import dummy_torch_and_transformers_and_k_diffusion_objects  # noqa F403
-
-    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects))
-else:
-    _import_structure["stable_diffusion"].extend(["StableDiffusionKDiffusionPipeline"])
-try:
-    if not is_flax_available():
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ..utils import dummy_flax_objects  # noqa F403
-
-    _dummy_objects.update(get_objects_from_module(dummy_flax_objects))
-else:
-    _import_structure["pipeline_flax_utils"] = ["FlaxDiffusionPipeline"]
-try:
-    if not (is_flax_available() and is_transformers_available()):
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ..utils import dummy_flax_and_transformers_objects  # noqa F403
-
-    _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects))
-else:
-    _import_structure["controlnet"].extend(["FlaxStableDiffusionControlNetPipeline"])
-    _import_structure["stable_diffusion"].extend(
-        [
-            "FlaxStableDiffusionImg2ImgPipeline",
-            "FlaxStableDiffusionInpaintPipeline",
-            "FlaxStableDiffusionPipeline",
-        ]
-    )
-try:
-    if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ..utils import dummy_transformers_and_torch_and_note_seq_objects  # noqa F403
-
-    _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects))
-else:
-    _import_structure["spectrogram_diffusion"] = ["MidiProcessor", "SpectrogramDiffusionPipeline"]
-
-if TYPE_CHECKING:
-    try:
-        if not is_torch_available():
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        from ..utils.dummy_pt_objects import *  # noqa F403
-
-    else:
-        from .auto_pipeline import AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image
-        from .consistency_models import ConsistencyModelPipeline
-        from .dance_diffusion import DanceDiffusionPipeline
-        from .ddim import DDIMPipeline
-        from .ddpm import DDPMPipeline
-        from .dit import DiTPipeline
-        from .latent_diffusion import LDMSuperResolutionPipeline
-        from .latent_diffusion_uncond import LDMPipeline
-        from .pipeline_utils import AudioPipelineOutput, DiffusionPipeline, ImagePipelineOutput
-        from .pndm import PNDMPipeline
-        from .repaint import RePaintPipeline
-        from .score_sde_ve import ScoreSdeVePipeline
-        from .stochastic_karras_ve import KarrasVePipeline
-
-    try:
-        if not (is_torch_available() and is_librosa_available()):
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        from ..utils.dummy_torch_and_librosa_objects import *
-    else:
-        from .audio_diffusion import AudioDiffusionPipeline, Mel
-
-    try:
-        if not (is_torch_available() and is_transformers_available()):
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        from ..utils.dummy_torch_and_transformers_objects import *
-    else:
-        from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline
-        from .audioldm import AudioLDMPipeline
-        from .audioldm2 import AudioLDM2Pipeline, AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel
-        from .blip_diffusion import BlipDiffusionPipeline
-        from .controlnet import (
-            BlipDiffusionControlNetPipeline,
-            StableDiffusionControlNetImg2ImgPipeline,
-            StableDiffusionControlNetInpaintPipeline,
-            StableDiffusionControlNetPipeline,
-            StableDiffusionXLControlNetImg2ImgPipeline,
-            StableDiffusionXLControlNetInpaintPipeline,
-            StableDiffusionXLControlNetPipeline,
-        )
-        from .deepfloyd_if import (
-            IFImg2ImgPipeline,
-            IFImg2ImgSuperResolutionPipeline,
-            IFInpaintingPipeline,
-            IFInpaintingSuperResolutionPipeline,
-            IFPipeline,
-            IFSuperResolutionPipeline,
-        )
-        from .kandinsky import (
-            KandinskyCombinedPipeline,
-            KandinskyImg2ImgCombinedPipeline,
-            KandinskyImg2ImgPipeline,
-            KandinskyInpaintCombinedPipeline,
-            KandinskyInpaintPipeline,
-            KandinskyPipeline,
-            KandinskyPriorPipeline,
-        )
-        from .kandinsky2_2 import (
-            KandinskyV22CombinedPipeline,
-            KandinskyV22ControlnetImg2ImgPipeline,
-            KandinskyV22ControlnetPipeline,
-            KandinskyV22Img2ImgCombinedPipeline,
-            KandinskyV22Img2ImgPipeline,
-            KandinskyV22InpaintCombinedPipeline,
-            KandinskyV22InpaintPipeline,
-            KandinskyV22Pipeline,
-            KandinskyV22PriorEmb2EmbPipeline,
-            KandinskyV22PriorPipeline,
-        )
-        from .latent_diffusion import LDMTextToImagePipeline
-        from .musicldm import MusicLDMPipeline
-        from .paint_by_example import PaintByExamplePipeline
-        from .semantic_stable_diffusion import SemanticStableDiffusionPipeline
-        from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline
-        from .stable_diffusion import (
-            CLIPImageProjection,
-            CycleDiffusionPipeline,
-            StableDiffusionAttendAndExcitePipeline,
-            StableDiffusionDepth2ImgPipeline,
-            StableDiffusionDiffEditPipeline,
-            StableDiffusionGLIGENPipeline,
-            StableDiffusionGLIGENTextImagePipeline,
-            StableDiffusionImageVariationPipeline,
-            StableDiffusionImg2ImgPipeline,
-            StableDiffusionInpaintPipeline,
-            StableDiffusionInpaintPipelineLegacy,
-            StableDiffusionInstructPix2PixPipeline,
-            StableDiffusionLatentUpscalePipeline,
-            StableDiffusionLDM3DPipeline,
-            StableDiffusionModelEditingPipeline,
-            StableDiffusionPanoramaPipeline,
-            StableDiffusionParadigmsPipeline,
-            StableDiffusionPipeline,
-            StableDiffusionPix2PixZeroPipeline,
-            StableDiffusionSAGPipeline,
-            StableDiffusionUpscalePipeline,
-            StableUnCLIPImg2ImgPipeline,
-            StableUnCLIPPipeline,
-        )
-        from .stable_diffusion_safe import StableDiffusionPipelineSafe
-        from .stable_diffusion_xl import (
-            StableDiffusionXLImg2ImgPipeline,
-            StableDiffusionXLInpaintPipeline,
-            StableDiffusionXLInstructPix2PixPipeline,
-            StableDiffusionXLPipeline,
-        )
-        from .t2i_adapter import StableDiffusionAdapterPipeline, StableDiffusionXLAdapterPipeline
-        from .text_to_video_synthesis import (
-            TextToVideoSDPipeline,
-            TextToVideoZeroPipeline,
-            VideoToVideoSDPipeline,
-        )
-        from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline
-        from .unidiffuser import (
-            ImageTextPipelineOutput,
-            UniDiffuserModel,
-            UniDiffuserPipeline,
-            UniDiffuserTextDecoder,
-        )
-        from .versatile_diffusion import (
-            VersatileDiffusionDualGuidedPipeline,
-            VersatileDiffusionImageVariationPipeline,
-            VersatileDiffusionPipeline,
-            VersatileDiffusionTextToImagePipeline,
-        )
-        from .vq_diffusion import VQDiffusionPipeline
-        from .wuerstchen import (
-            WuerstchenCombinedPipeline,
-            WuerstchenDecoderPipeline,
-            WuerstchenPriorPipeline,
-        )
-
-        try:
-            if not is_onnx_available():
-                raise OptionalDependencyNotAvailable()
-        except OptionalDependencyNotAvailable:
-            from ..utils.dummy_onnx_objects import *  # noqa F403
-
-        else:
-            from .onnx_utils import OnnxRuntimeModel
-
-        try:
-            if not (is_torch_available() and is_transformers_available() and is_onnx_available()):
-                raise OptionalDependencyNotAvailable()
-        except OptionalDependencyNotAvailable:
-            from ..utils.dummy_torch_and_transformers_and_onnx_objects import *
-        else:
-            from .stable_diffusion import (
-                OnnxStableDiffusionImg2ImgPipeline,
-                OnnxStableDiffusionInpaintPipeline,
-                OnnxStableDiffusionInpaintPipelineLegacy,
-                OnnxStableDiffusionPipeline,
-                OnnxStableDiffusionUpscalePipeline,
-                StableDiffusionOnnxPipeline,
-            )
-
-        try:
-            if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()):
-                raise OptionalDependencyNotAvailable()
-        except OptionalDependencyNotAvailable:
-            from ..utils.dummy_torch_and_transformers_and_k_diffusion_objects import *
-        else:
-            from .stable_diffusion import StableDiffusionKDiffusionPipeline
-
-        try:
-            if not is_flax_available():
-                raise OptionalDependencyNotAvailable()
-        except OptionalDependencyNotAvailable:
-            from ..utils.dummy_flax_objects import *  # noqa F403
-        else:
-            from .pipeline_flax_utils import FlaxDiffusionPipeline
-
-        try:
-            if not (is_flax_available() and is_transformers_available()):
-                raise OptionalDependencyNotAvailable()
-        except OptionalDependencyNotAvailable:
-            from ..utils.dummy_flax_and_transformers_objects import *
-        else:
-            from .controlnet import FlaxStableDiffusionControlNetPipeline
-            from .stable_diffusion import (
-                FlaxStableDiffusionImg2ImgPipeline,
-                FlaxStableDiffusionInpaintPipeline,
-                FlaxStableDiffusionPipeline,
-            )
-
-        try:
-            if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
-                raise OptionalDependencyNotAvailable()
-        except OptionalDependencyNotAvailable:
-            from ..utils.dummy_transformers_and_torch_and_note_seq_objects import *  # noqa F403
-
-        else:
-            from .spectrogram_diffusion import MidiProcessor, SpectrogramDiffusionPipeline
-
-else:
-    import sys
-
-    sys.modules[__name__] = _LazyModule(
-        __name__,
-        globals()["__file__"],
-        _import_structure,
-        module_spec=__spec__,
-    )
-    for name, value in _dummy_objects.items():
-        setattr(sys.modules[__name__], name, value)
+from typing import TYPE_CHECKING
+
+from ..utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_flax_available,
+    is_k_diffusion_available,
+    is_librosa_available,
+    is_note_seq_available,
+    is_onnx_available,
+    is_torch_available,
+    is_transformers_available,
+)
+
+
+# These modules contain pipelines from multiple libraries/frameworks
+_dummy_objects = {}
+_import_structure = {"stable_diffusion": [], "stable_diffusion_xl": [], "latent_diffusion": [], "controlnet": []}
+
+try:
+    if not is_torch_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_pt_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_pt_objects))
+else:
+    _import_structure["auto_pipeline"] = [
+        "AutoPipelineForImage2Image",
+        "AutoPipelineForInpainting",
+        "AutoPipelineForText2Image",
+    ]
+    _import_structure["consistency_models"] = ["ConsistencyModelPipeline"]
+    _import_structure["dance_diffusion"] = ["DanceDiffusionPipeline"]
+    _import_structure["ddim"] = ["DDIMPipeline"]
+    _import_structure["ddpm"] = ["DDPMPipeline"]
+    _import_structure["dit"] = ["DiTPipeline"]
+    _import_structure["latent_diffusion"].extend(["LDMSuperResolutionPipeline"])
+    _import_structure["latent_diffusion_uncond"] = ["LDMPipeline"]
+    _import_structure["pipeline_utils"] = ["AudioPipelineOutput", "DiffusionPipeline", "ImagePipelineOutput"]
+    _import_structure["pndm"] = ["PNDMPipeline"]
+    _import_structure["repaint"] = ["RePaintPipeline"]
+    _import_structure["score_sde_ve"] = ["ScoreSdeVePipeline"]
+    _import_structure["stochastic_karras_ve"] = ["KarrasVePipeline"]
+try:
+    if not (is_torch_available() and is_librosa_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_torch_and_librosa_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_librosa_objects))
+else:
+    _import_structure["audio_diffusion"] = ["AudioDiffusionPipeline", "Mel"]
+try:
+    if not (is_torch_available() and is_transformers_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_torch_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["alt_diffusion"] = ["AltDiffusionImg2ImgPipeline", "AltDiffusionPipeline"]
+    _import_structure["audioldm"] = ["AudioLDMPipeline"]
+    _import_structure["audioldm2"] = [
+        "AudioLDM2Pipeline",
+        "AudioLDM2ProjectionModel",
+        "AudioLDM2UNet2DConditionModel",
+    ]
+    _import_structure["blip_diffusion"] = ["BlipDiffusionPipeline"]
+    _import_structure["controlnet"].extend(
+        [
+            "BlipDiffusionControlNetPipeline",
+            "StableDiffusionControlNetImg2ImgPipeline",
+            "StableDiffusionControlNetInpaintPipeline",
+            "StableDiffusionControlNetPipeline",
+            "StableDiffusionXLControlNetImg2ImgPipeline",
+            "StableDiffusionXLControlNetInpaintPipeline",
+            "StableDiffusionXLControlNetPipeline",
+        ]
+    )
+    _import_structure["deepfloyd_if"] = [
+        "IFImg2ImgPipeline",
+        "IFImg2ImgSuperResolutionPipeline",
+        "IFInpaintingPipeline",
+        "IFInpaintingSuperResolutionPipeline",
+        "IFPipeline",
+        "IFSuperResolutionPipeline",
+    ]
+    _import_structure["kandinsky"] = [
+        "KandinskyCombinedPipeline",
+        "KandinskyImg2ImgCombinedPipeline",
+        "KandinskyImg2ImgPipeline",
+        "KandinskyInpaintCombinedPipeline",
+        "KandinskyInpaintPipeline",
+        "KandinskyPipeline",
+        "KandinskyPriorPipeline",
+    ]
+    _import_structure["kandinsky2_2"] = [
+        "KandinskyV22CombinedPipeline",
+        "KandinskyV22ControlnetImg2ImgPipeline",
+        "KandinskyV22ControlnetPipeline",
+        "KandinskyV22Img2ImgCombinedPipeline",
+        "KandinskyV22Img2ImgPipeline",
+        "KandinskyV22InpaintCombinedPipeline",
+        "KandinskyV22InpaintPipeline",
+        "KandinskyV22Pipeline",
+        "KandinskyV22PriorEmb2EmbPipeline",
+        "KandinskyV22PriorPipeline",
+    ]
+    _import_structure["latent_diffusion"].extend(["LDMTextToImagePipeline"])
+    _import_structure["musicldm"] = ["MusicLDMPipeline"]
+    _import_structure["paint_by_example"] = ["PaintByExamplePipeline"]
+    _import_structure["semantic_stable_diffusion"] = ["SemanticStableDiffusionPipeline"]
+    _import_structure["shap_e"] = ["ShapEImg2ImgPipeline", "ShapEPipeline"]
+    _import_structure["stable_diffusion"].extend(
+        [
+            "CLIPImageProjection",
+            "CycleDiffusionPipeline",
+            "StableDiffusionAttendAndExcitePipeline",
+            "StableDiffusionDepth2ImgPipeline",
+            "StableDiffusionDiffEditPipeline",
+            "StableDiffusionGLIGENPipeline",
+            "StableDiffusionGLIGENPipeline",
+            "StableDiffusionGLIGENTextImagePipeline",
+            "StableDiffusionImageVariationPipeline",
+            "StableDiffusionImg2ImgPipeline",
+            "StableDiffusionInpaintPipeline",
+            "StableDiffusionInpaintPipelineLegacy",
+            "StableDiffusionInstructPix2PixPipeline",
+            "StableDiffusionLatentUpscalePipeline",
+            "StableDiffusionLDM3DPipeline",
+            "StableDiffusionModelEditingPipeline",
+            "StableDiffusionPanoramaPipeline",
+            "StableDiffusionParadigmsPipeline",
+            "StableDiffusionPipeline",
+            "StableDiffusionPix2PixZeroPipeline",
+            "StableDiffusionSAGPipeline",
+            "StableDiffusionUpscalePipeline",
+            "StableUnCLIPImg2ImgPipeline",
+            "StableUnCLIPPipeline",
+        ]
+    )
+    _import_structure["stable_diffusion_safe"] = ["StableDiffusionPipelineSafe"]
+    _import_structure["stable_diffusion_xl"].extend(
+        [
+            "StableDiffusionXLImg2ImgPipeline",
+            "StableDiffusionXLInpaintPipeline",
+            "StableDiffusionXLInstructPix2PixPipeline",
+            "StableDiffusionXLPipeline",
+        ]
+    )
+    _import_structure["t2i_adapter"] = ["StableDiffusionAdapterPipeline", "StableDiffusionXLAdapterPipeline"]
+    _import_structure["text_to_video_synthesis"] = [
+        "TextToVideoSDPipeline",
+        "TextToVideoZeroPipeline",
+        "VideoToVideoSDPipeline",
+    ]
+    _import_structure["unclip"] = ["UnCLIPImageVariationPipeline", "UnCLIPPipeline"]
+    _import_structure["unidiffuser"] = [
+        "ImageTextPipelineOutput",
+        "UniDiffuserModel",
+        "UniDiffuserPipeline",
+        "UniDiffuserTextDecoder",
+    ]
+    _import_structure["versatile_diffusion"] = [
+        "VersatileDiffusionDualGuidedPipeline",
+        "VersatileDiffusionImageVariationPipeline",
+        "VersatileDiffusionPipeline",
+        "VersatileDiffusionTextToImagePipeline",
+    ]
+    _import_structure["vq_diffusion"] = ["VQDiffusionPipeline"]
+    _import_structure["wuerstchen"] = [
+        "WuerstchenCombinedPipeline",
+        "WuerstchenDecoderPipeline",
+        "WuerstchenPriorPipeline",
+    ]
+try:
+    if not is_onnx_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_onnx_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_onnx_objects))
+else:
+    _import_structure["onnx_utils"] = ["OnnxRuntimeModel"]
+try:
+    if not (is_torch_available() and is_transformers_available() and is_onnx_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_torch_and_transformers_and_onnx_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_onnx_objects))
+else:
+    _import_structure["stable_diffusion"].extend(
+        [
+            "OnnxStableDiffusionImg2ImgPipeline",
+            "OnnxStableDiffusionInpaintPipeline",
+            "OnnxStableDiffusionInpaintPipelineLegacy",
+            "OnnxStableDiffusionPipeline",
+            "OnnxStableDiffusionUpscalePipeline",
+            "StableDiffusionOnnxPipeline",
+        ]
+    )
+
+try:
+    if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_torch_and_transformers_and_k_diffusion_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_k_diffusion_objects))
+else:
+    _import_structure["stable_diffusion"].extend(["StableDiffusionKDiffusionPipeline"])
+try:
+    if not is_flax_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_flax_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_flax_objects))
+else:
+    _import_structure["pipeline_flax_utils"] = ["FlaxDiffusionPipeline"]
+try:
+    if not (is_flax_available() and is_transformers_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_flax_and_transformers_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_flax_and_transformers_objects))
+else:
+    _import_structure["controlnet"].extend(["FlaxStableDiffusionControlNetPipeline"])
+    _import_structure["stable_diffusion"].extend(
+        [
+            "FlaxStableDiffusionImg2ImgPipeline",
+            "FlaxStableDiffusionInpaintPipeline",
+            "FlaxStableDiffusionPipeline",
+        ]
+    )
+    _import_structure["stable_diffusion_xl"].extend(
+        [
+            "FlaxStableDiffusionXLPipeline",
+        ]
+    )
+try:
+    if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ..utils import dummy_transformers_and_torch_and_note_seq_objects  # noqa F403
+
+    _dummy_objects.update(get_objects_from_module(dummy_transformers_and_torch_and_note_seq_objects))
+else:
+    _import_structure["spectrogram_diffusion"] = ["MidiProcessor", "SpectrogramDiffusionPipeline"]
+
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not is_torch_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ..utils.dummy_pt_objects import *  # noqa F403
+
+    else:
+        from .auto_pipeline import AutoPipelineForImage2Image, AutoPipelineForInpainting, AutoPipelineForText2Image
+        from .consistency_models import ConsistencyModelPipeline
+        from .dance_diffusion import DanceDiffusionPipeline
+        from .ddim import DDIMPipeline
+        from .ddpm import DDPMPipeline
+        from .dit import DiTPipeline
+        from .latent_diffusion import LDMSuperResolutionPipeline
+        from .latent_diffusion_uncond import LDMPipeline
+        from .pipeline_utils import AudioPipelineOutput, DiffusionPipeline, ImagePipelineOutput
+        from .pndm import PNDMPipeline
+        from .repaint import RePaintPipeline
+        from .score_sde_ve import ScoreSdeVePipeline
+        from .stochastic_karras_ve import KarrasVePipeline
+
+    try:
+        if not (is_torch_available() and is_librosa_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ..utils.dummy_torch_and_librosa_objects import *
+    else:
+        from .audio_diffusion import AudioDiffusionPipeline, Mel
+
+    try:
+        if not (is_torch_available() and is_transformers_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ..utils.dummy_torch_and_transformers_objects import *
+    else:
+        from .alt_diffusion import AltDiffusionImg2ImgPipeline, AltDiffusionPipeline
+        from .audioldm import AudioLDMPipeline
+        from .audioldm2 import AudioLDM2Pipeline, AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel
+        from .blip_diffusion import BlipDiffusionPipeline
+        from .controlnet import (
+            BlipDiffusionControlNetPipeline,
+            StableDiffusionControlNetImg2ImgPipeline,
+            StableDiffusionControlNetInpaintPipeline,
+            StableDiffusionControlNetPipeline,
+            StableDiffusionXLControlNetImg2ImgPipeline,
+            StableDiffusionXLControlNetInpaintPipeline,
+            StableDiffusionXLControlNetPipeline,
+        )
+        from .deepfloyd_if import (
+            IFImg2ImgPipeline,
+            IFImg2ImgSuperResolutionPipeline,
+            IFInpaintingPipeline,
+            IFInpaintingSuperResolutionPipeline,
+            IFPipeline,
+            IFSuperResolutionPipeline,
+        )
+        from .kandinsky import (
+            KandinskyCombinedPipeline,
+            KandinskyImg2ImgCombinedPipeline,
+            KandinskyImg2ImgPipeline,
+            KandinskyInpaintCombinedPipeline,
+            KandinskyInpaintPipeline,
+            KandinskyPipeline,
+            KandinskyPriorPipeline,
+        )
+        from .kandinsky2_2 import (
+            KandinskyV22CombinedPipeline,
+            KandinskyV22ControlnetImg2ImgPipeline,
+            KandinskyV22ControlnetPipeline,
+            KandinskyV22Img2ImgCombinedPipeline,
+            KandinskyV22Img2ImgPipeline,
+            KandinskyV22InpaintCombinedPipeline,
+            KandinskyV22InpaintPipeline,
+            KandinskyV22Pipeline,
+            KandinskyV22PriorEmb2EmbPipeline,
+            KandinskyV22PriorPipeline,
+        )
+        from .latent_diffusion import LDMTextToImagePipeline
+        from .musicldm import MusicLDMPipeline
+        from .paint_by_example import PaintByExamplePipeline
+        from .semantic_stable_diffusion import SemanticStableDiffusionPipeline
+        from .shap_e import ShapEImg2ImgPipeline, ShapEPipeline
+        from .stable_diffusion import (
+            CLIPImageProjection,
+            CycleDiffusionPipeline,
+            StableDiffusionAttendAndExcitePipeline,
+            StableDiffusionDepth2ImgPipeline,
+            StableDiffusionDiffEditPipeline,
+            StableDiffusionGLIGENPipeline,
+            StableDiffusionGLIGENTextImagePipeline,
+            StableDiffusionImageVariationPipeline,
+            StableDiffusionImg2ImgPipeline,
+            StableDiffusionInpaintPipeline,
+            StableDiffusionInpaintPipelineLegacy,
+            StableDiffusionInstructPix2PixPipeline,
+            StableDiffusionLatentUpscalePipeline,
+            StableDiffusionLDM3DPipeline,
+            StableDiffusionModelEditingPipeline,
+            StableDiffusionPanoramaPipeline,
+            StableDiffusionParadigmsPipeline,
+            StableDiffusionPipeline,
+            StableDiffusionPix2PixZeroPipeline,
+            StableDiffusionSAGPipeline,
+            StableDiffusionUpscalePipeline,
+            StableUnCLIPImg2ImgPipeline,
+            StableUnCLIPPipeline,
+        )
+        from .stable_diffusion_safe import StableDiffusionPipelineSafe
+        from .stable_diffusion_xl import (
+            StableDiffusionXLImg2ImgPipeline,
+            StableDiffusionXLInpaintPipeline,
+            StableDiffusionXLInstructPix2PixPipeline,
+            StableDiffusionXLPipeline,
+        )
+        from .t2i_adapter import StableDiffusionAdapterPipeline, StableDiffusionXLAdapterPipeline
+        from .text_to_video_synthesis import (
+            TextToVideoSDPipeline,
+            TextToVideoZeroPipeline,
+            VideoToVideoSDPipeline,
+        )
+        from .unclip import UnCLIPImageVariationPipeline, UnCLIPPipeline
+        from .unidiffuser import (
+            ImageTextPipelineOutput,
+            UniDiffuserModel,
+            UniDiffuserPipeline,
+            UniDiffuserTextDecoder,
+        )
+        from .versatile_diffusion import (
+            VersatileDiffusionDualGuidedPipeline,
+            VersatileDiffusionImageVariationPipeline,
+            VersatileDiffusionPipeline,
+            VersatileDiffusionTextToImagePipeline,
+        )
+        from .vq_diffusion import VQDiffusionPipeline
+        from .wuerstchen import (
+            WuerstchenCombinedPipeline,
+            WuerstchenDecoderPipeline,
+            WuerstchenPriorPipeline,
+        )
+
+        try:
+            if not is_onnx_available():
+                raise OptionalDependencyNotAvailable()
+        except OptionalDependencyNotAvailable:
+            from ..utils.dummy_onnx_objects import *  # noqa F403
+
+        else:
+            from .onnx_utils import OnnxRuntimeModel
+
+        try:
+            if not (is_torch_available() and is_transformers_available() and is_onnx_available()):
+                raise OptionalDependencyNotAvailable()
+        except OptionalDependencyNotAvailable:
+            from ..utils.dummy_torch_and_transformers_and_onnx_objects import *
+        else:
+            from .stable_diffusion import (
+                OnnxStableDiffusionImg2ImgPipeline,
+                OnnxStableDiffusionInpaintPipeline,
+                OnnxStableDiffusionInpaintPipelineLegacy,
+                OnnxStableDiffusionPipeline,
+                OnnxStableDiffusionUpscalePipeline,
+                StableDiffusionOnnxPipeline,
+            )
+
+        try:
+            if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()):
+                raise OptionalDependencyNotAvailable()
+        except OptionalDependencyNotAvailable:
+            from ..utils.dummy_torch_and_transformers_and_k_diffusion_objects import *
+        else:
+            from .stable_diffusion import StableDiffusionKDiffusionPipeline
+
+        try:
+            if not is_flax_available():
+                raise OptionalDependencyNotAvailable()
+        except OptionalDependencyNotAvailable:
+            from ..utils.dummy_flax_objects import *  # noqa F403
+        else:
+            from .pipeline_flax_utils import FlaxDiffusionPipeline
+
+        try:
+            if not (is_flax_available() and is_transformers_available()):
+                raise OptionalDependencyNotAvailable()
+        except OptionalDependencyNotAvailable:
+            from ..utils.dummy_flax_and_transformers_objects import *
+        else:
+            from .controlnet import FlaxStableDiffusionControlNetPipeline
+            from .stable_diffusion import (
+                FlaxStableDiffusionImg2ImgPipeline,
+                FlaxStableDiffusionInpaintPipeline,
+                FlaxStableDiffusionPipeline,
+            )
+            from .stable_diffusion_xl import (
+                FlaxStableDiffusionXLPipeline,
+            )
+
+        try:
+            if not (is_transformers_available() and is_torch_available() and is_note_seq_available()):
+                raise OptionalDependencyNotAvailable()
+        except OptionalDependencyNotAvailable:
+            from ..utils.dummy_transformers_and_torch_and_note_seq_objects import *  # noqa F403
+
+        else:
+            from .spectrogram_diffusion import MidiProcessor, SpectrogramDiffusionPipeline
+
+else:
+    import sys
+
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)
@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING

 from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
@@ -26,7 +27,7 @@ else:

    _import_structure["pipeline_output"] = ["AltDiffusionPipelineOutput"]

-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
@@ -29,7 +29,8 @@ from ...utils import deprecate, logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
-from . import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation
+from .modeling_roberta_series import RobertaSeriesModelWithTransformation
+from .pipeline_output import AltDiffusionPipelineOutput


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -16,7 +16,7 @@ import inspect
 from typing import Any, Callable, Dict, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from packaging import version
 from transformers import CLIPImageProcessor, XLMRobertaTokenizer
@@ -31,7 +31,8 @@ from ...utils import PIL_INTERPOLATION, deprecate, logging, replace_example_docs
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
-from . import AltDiffusionPipelineOutput, RobertaSeriesModelWithTransformation
+from .modeling_roberta_series import RobertaSeriesModelWithTransformation
+from .pipeline_output import AltDiffusionPipelineOutput


 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -2,7 +2,7 @@ from dataclasses import dataclass
 from typing import List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image

 from ...utils import (
    BaseOutput,
@@ -1,6 +1,6 @@
 from typing import TYPE_CHECKING

-from ...utils import _LazyModule
+from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule


 _import_structure = {
@@ -8,7 +8,7 @@ _import_structure = {
    "pipeline_audio_diffusion": ["AudioDiffusionPipeline"],
 }

-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    from .mel import Mel
    from .pipeline_audio_diffusion import AudioDiffusionPipeline

@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING

 from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    is_torch_available,
@@ -25,7 +26,7 @@ else:
    _import_structure["pipeline_audioldm"] = ["AudioLDMPipeline"]


-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
            raise OptionalDependencyNotAvailable()
@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING

 from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
@@ -25,7 +26,7 @@ else:
    _import_structure["pipeline_audioldm2"] = ["AudioLDM2Pipeline"]


-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
            raise OptionalDependencyNotAvailable()
@@ -38,7 +38,7 @@ from diffusers.utils import numpy_to_pil


 if is_vision_available():
-    import PIL
+    import PIL.Image


 logger = logging.get_logger(__name__)
@@ -13,7 +13,7 @@
 # limitations under the License.
 from typing import List, Optional, Union

-import PIL
+import PIL.Image
 import torch
 from transformers import CLIPTokenizer

@@ -98,6 +98,8 @@ class BlipDiffusionPipeline(DiffusionPipeline):
            Position of the context token in the text encoder.
    """

+    model_cpu_offload_seq = "qformer->text_encoder->unet->vae"
+
    def __init__(
        self,
        tokenizer: CLIPTokenizer,
@@ -155,7 +157,9 @@ class BlipDiffusionPipeline(DiffusionPipeline):
        latents = latents * self.scheduler.init_noise_sigma
        return latents

-    def encode_prompt(self, query_embeds, prompt):
+    def encode_prompt(self, query_embeds, prompt, device=None):
+        device = device or self._execution_device
+
        # embeddings for prompt, with query_embeds as context
        max_len = self.text_encoder.text_model.config.max_position_embeddings
        max_len -= self.qformer.config.num_query_tokens
@@ -166,7 +170,7 @@ class BlipDiffusionPipeline(DiffusionPipeline):
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
-        ).to(self.device)
+        ).to(device)

        batch_size = query_embeds.shape[0]
        ctx_begin_pos = [self.config.ctx_begin_pos] * batch_size
@@ -249,11 +253,12 @@ class BlipDiffusionPipeline(DiffusionPipeline):
        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        """
+        device = self._execution_device

        reference_image = self.image_processor.preprocess(
            reference_image, image_mean=self.config.mean, image_std=self.config.std, return_tensors="pt"
        )["pixel_values"]
-        reference_image = reference_image.to(self.device)
+        reference_image = reference_image.to(device)

        if isinstance(prompt, str):
            prompt = [prompt]
@@ -271,7 +276,7 @@ class BlipDiffusionPipeline(DiffusionPipeline):
            prompt_reps=prompt_reps,
        )
        query_embeds = self.get_query_embeddings(reference_image, source_subject_category)
-        text_embeddings = self.encode_prompt(query_embeds, prompt)
+        text_embeddings = self.encode_prompt(query_embeds, prompt, device)
        do_classifier_free_guidance = guidance_scale > 1.0
        if do_classifier_free_guidance:
            max_length = self.text_encoder.text_model.config.max_position_embeddings
@@ -283,7 +288,7 @@ class BlipDiffusionPipeline(DiffusionPipeline):
                return_tensors="pt",
            )
            uncond_embeddings = self.text_encoder(
-                input_ids=uncond_input.input_ids.to(self.device),
+                input_ids=uncond_input.input_ids.to(device),
                ctx_embeddings=None,
            )[0]
            # For classifier free guidance, we need to do two forward passes.
@@ -300,7 +305,7 @@ class BlipDiffusionPipeline(DiffusionPipeline):
            generator=generator,
            latents=latents,
            dtype=self.unet.dtype,
-            device=self.device,
+            device=device,
        )
        # set timesteps
        extra_set_kwargs = {}
@@ -330,9 +335,13 @@ class BlipDiffusionPipeline(DiffusionPipeline):
                t,
                latents,
            )["prev_sample"]
+
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        image = self.image_processor.postprocess(image, output_type=output_type)

+        # Offload all models
+        self.maybe_free_model_hooks()
+
        if not return_dict:
            return (image,)

@@ -1,13 +1,14 @@
 from typing import TYPE_CHECKING

 from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
    _LazyModule,
 )


 _import_structure = {"pipeline_consistency_models": ["ConsistencyModelPipeline"]}

-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    from .pipeline_consistency_models import ConsistencyModelPipeline

 else:
@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING

 from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
@@ -40,7 +41,7 @@ else:
    _import_structure["pipeline_flax_controlnet"] = ["FlaxStableDiffusionControlNetPipeline"]


-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
@@ -34,7 +34,7 @@ from ...utils import (
 )
 from ...utils.torch_utils import is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline
-from ..stable_diffusion import StableDiffusionPipelineOutput
+from ..stable_diffusion.pipeline_output import StableDiffusionPipelineOutput
 from ..stable_diffusion.safety_checker import StableDiffusionSafetyChecker
 from .multicontrolnet import MultiControlNetModel

@@ -14,7 +14,7 @@
 # limitations under the License.
 from typing import List, Optional, Union

-import PIL
+import PIL.Image
 import torch
 from transformers import CLIPTokenizer

@@ -51,7 +51,7 @@ EXAMPLE_DOC_STRING = """

        >>> cldm_cond_image = load_image(
        ...     "https://huggingface.co/datasets/ayushtues/blipdiffusion_images/resolve/main/kettle.jpg"
-        ... ).resize(512, 512)
+        ... ).resize((512, 512))
        >>> canny = CannyDetector()
        >>> cldm_cond_image = canny(cldm_cond_image, 30, 70, output_type="pil")
        >>> style_image = load_image(
@@ -107,6 +107,8 @@ class BlipDiffusionControlNetPipeline(DiffusionPipeline):
            Position of the context token in the text encoder.
    """

+    model_cpu_offload_seq = "qformer->text_encoder->unet->vae"
+
    def __init__(
        self,
        tokenizer: CLIPTokenizer,
@@ -166,7 +168,9 @@ class BlipDiffusionControlNetPipeline(DiffusionPipeline):
        latents = latents * self.scheduler.init_noise_sigma
        return latents

-    def encode_prompt(self, query_embeds, prompt):
+    def encode_prompt(self, query_embeds, prompt, device=None):
+        device = device or self._execution_device
+
        # embeddings for prompt, with query_embeds as context
        max_len = self.text_encoder.text_model.config.max_position_embeddings
        max_len -= self.qformer.config.num_query_tokens
@@ -177,7 +181,7 @@ class BlipDiffusionControlNetPipeline(DiffusionPipeline):
            truncation=True,
            max_length=max_len,
            return_tensors="pt",
-        ).to(self.device)
+        ).to(device)

        batch_size = query_embeds.shape[0]
        ctx_begin_pos = [self.config.ctx_begin_pos] * batch_size
@@ -297,11 +301,12 @@ class BlipDiffusionControlNetPipeline(DiffusionPipeline):
        Returns:
            [`~pipelines.ImagePipelineOutput`] or `tuple`
        """
+        device = self._execution_device

        reference_image = self.image_processor.preprocess(
            reference_image, image_mean=self.config.mean, image_std=self.config.std, return_tensors="pt"
        )["pixel_values"]
-        reference_image = reference_image.to(self.device)
+        reference_image = reference_image.to(device)

        if isinstance(prompt, str):
            prompt = [prompt]
@@ -319,7 +324,7 @@ class BlipDiffusionControlNetPipeline(DiffusionPipeline):
            prompt_reps=prompt_reps,
        )
        query_embeds = self.get_query_embeddings(reference_image, source_subject_category)
-        text_embeddings = self.encode_prompt(query_embeds, prompt)
+        text_embeddings = self.encode_prompt(query_embeds, prompt, device)
        # 3. unconditional embedding
        do_classifier_free_guidance = guidance_scale > 1.0
        if do_classifier_free_guidance:
@@ -332,7 +337,7 @@ class BlipDiffusionControlNetPipeline(DiffusionPipeline):
                return_tensors="pt",
            )
            uncond_embeddings = self.text_encoder(
-                input_ids=uncond_input.input_ids.to(self.device),
+                input_ids=uncond_input.input_ids.to(device),
                ctx_embeddings=None,
            )[0]
            # For classifier free guidance, we need to do two forward passes.
@@ -348,7 +353,7 @@ class BlipDiffusionControlNetPipeline(DiffusionPipeline):
            generator=generator,
            latents=latents,
            dtype=self.unet.dtype,
-            device=self.device,
+            device=device,
        )
        # set timesteps
        extra_set_kwargs = {}
@@ -399,6 +404,9 @@ class BlipDiffusionControlNetPipeline(DiffusionPipeline):
        image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0]
        image = self.image_processor.postprocess(image, output_type=output_type)

+        # Offload all models
+        self.maybe_free_model_hooks()
+
        if not return_dict:
            return (image,)

@@ -16,13 +16,11 @@ import inspect
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 import torch.nn.functional as F
 from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer

-from diffusers.pipelines.stable_diffusion_xl import StableDiffusionXLPipelineOutput
-
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ControlNetModel, UNet2DConditionModel
@@ -41,6 +39,7 @@ from ...utils import (
 )
 from ...utils.torch_utils import is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput
 from .multicontrolnet import MultiControlNetModel


@@ -316,7 +315,7 @@ class StableDiffusionXLControlNetInpaintPipeline(

            # dynamically adjust the LoRA scale
            adjust_lora_scale_text_encoder(self.text_encoder, lora_scale, self.use_peft_backend)
-            adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+            adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale, self.use_peft_backend)

        prompt = [prompt] if isinstance(prompt, str) else prompt

@@ -41,7 +41,7 @@ from ...utils import (
 )
 from ...utils.torch_utils import is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline
-from ..stable_diffusion_xl import StableDiffusionXLPipelineOutput
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput


 if is_invisible_watermark_available():
@@ -289,7 +289,7 @@ class StableDiffusionXLControlNetPipeline(

            # dynamically adjust the LoRA scale
            adjust_lora_scale_text_encoder(self.text_encoder, lora_scale, self.use_peft_backend)
-            adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+            adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale, self.use_peft_backend)

        prompt = [prompt] if isinstance(prompt, str) else prompt

@@ -863,7 +863,7 @@ class StableDiffusionXLControlNetPipeline(
                The percentage of total steps at which the ControlNet stops applying.
            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
-                `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
                explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
@@ -873,7 +873,7 @@ class StableDiffusionXLControlNetPipeline(
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                For most cases, `target_size` should be set to the desired height and width of the generated image. If
-                not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
@@ -41,7 +41,7 @@ from ...utils import (
 )
 from ...utils.torch_utils import is_compiled_module, randn_tensor
 from ..pipeline_utils import DiffusionPipeline
-from ..stable_diffusion_xl import StableDiffusionXLPipelineOutput
+from ..stable_diffusion_xl.pipeline_output import StableDiffusionXLPipelineOutput


 if is_invisible_watermark_available():
@@ -327,7 +327,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline(

            # dynamically adjust the LoRA scale
            adjust_lora_scale_text_encoder(self.text_encoder, lora_scale, self.use_peft_backend)
-            adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale)
+            adjust_lora_scale_text_encoder(self.text_encoder_2, lora_scale, self.use_peft_backend)

        prompt = [prompt] if isinstance(prompt, str) else prompt

@@ -1028,7 +1028,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                The percentage of total steps at which the controlnet stops applying.
            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
-                `original_size` defaults to `(width, height)` if not specified. Part of SDXL's micro-conditioning as
+                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
                explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
@@ -1038,7 +1038,7 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                For most cases, `target_size` should be set to the desired height and width of the generated image. If
-                not specified it will default to `(width, height)`. Part of SDXL's micro-conditioning as explained in
+                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
@@ -1,11 +1,11 @@
 from typing import TYPE_CHECKING

-from ...utils import _LazyModule
+from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule


 _import_structure = {"pipeline_dance_diffusion": ["DanceDiffusionPipeline"]}

-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    from .pipeline_dance_diffusion import DanceDiffusionPipeline
 else:
    import sys
@@ -1,11 +1,11 @@
 from typing import TYPE_CHECKING

-from ...utils import _LazyModule
+from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule


 _import_structure = {"pipeline_ddim": ["DDIMPipeline"]}

-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    from .pipeline_ddim import DDIMPipeline
 else:
    import sys
@@ -1,13 +1,14 @@
 from typing import TYPE_CHECKING

 from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
    _LazyModule,
 )


 _import_structure = {"pipeline_ddpm": ["DDPMPipeline"]}

-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    from .pipeline_ddpm import DDPMPipeline

 else:
@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING

 from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
@@ -42,7 +43,7 @@ else:
    _import_structure["watermark"] = ["IFWatermarker"]


-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
@@ -20,7 +20,7 @@ from ...utils import (
 )
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
-from . import IFPipelineOutput
+from .pipeline_output import IFPipelineOutput
 from .safety_checker import IFSafetyChecker
 from .watermark import IFWatermarker

@@ -5,7 +5,7 @@ import urllib.parse as ul
 from typing import Any, Callable, Dict, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer

@@ -23,7 +23,7 @@ from ...utils import (
 )
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
-from . import IFPipelineOutput
+from .pipeline_output import IFPipelineOutput
 from .safety_checker import IFSafetyChecker
 from .watermark import IFWatermarker

@@ -5,7 +5,7 @@ import urllib.parse as ul
 from typing import Any, Callable, Dict, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 import torch.nn.functional as F
 from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
@@ -24,7 +24,7 @@ from ...utils import (
 )
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
-from . import IFPipelineOutput
+from .pipeline_output import IFPipelineOutput
 from .safety_checker import IFSafetyChecker
 from .watermark import IFWatermarker

@@ -5,7 +5,7 @@ import urllib.parse as ul
 from typing import Any, Callable, Dict, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer

@@ -23,7 +23,7 @@ from ...utils import (
 )
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
-from . import IFPipelineOutput
+from .pipeline_output import IFPipelineOutput
 from .safety_checker import IFSafetyChecker
 from .watermark import IFWatermarker

@@ -5,7 +5,7 @@ import urllib.parse as ul
 from typing import Any, Callable, Dict, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 import torch.nn.functional as F
 from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
@@ -24,7 +24,7 @@ from ...utils import (
 )
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
-from . import IFPipelineOutput
+from .pipeline_output import IFPipelineOutput
 from .safety_checker import IFSafetyChecker
 from .watermark import IFWatermarker

@@ -5,7 +5,7 @@ import urllib.parse as ul
 from typing import Any, Callable, Dict, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 import torch.nn.functional as F
 from transformers import CLIPImageProcessor, T5EncoderModel, T5Tokenizer
@@ -23,7 +23,7 @@ from ...utils import (
 )
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
-from . import IFPipelineOutput
+from .pipeline_output import IFPipelineOutput
 from .safety_checker import IFSafetyChecker
 from .watermark import IFWatermarker

@@ -2,7 +2,7 @@ from dataclasses import dataclass
 from typing import List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image

 from ...utils import BaseOutput

@@ -1,6 +1,6 @@
 from typing import List

-import PIL
+import PIL.Image
 import torch
 from PIL import Image

@@ -1,11 +1,11 @@
 from typing import TYPE_CHECKING

-from ...utils import _LazyModule
+from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule


 _import_structure = {"pipeline_dit": ["DiTPipeline"]}

-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    from .pipeline_dit import DiTPipeline

 else:
@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING

 from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
@@ -32,7 +33,7 @@ else:
    _import_structure["text_encoder"] = ["MultilingualCLIP"]


-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
@@ -13,7 +13,7 @@
 # limitations under the License.
 from typing import Callable, List, Optional, Union

-import PIL
+import PIL.Image
 import torch
 from transformers import (
    CLIPImageProcessor,
@@ -15,7 +15,7 @@
 from typing import Callable, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from PIL import Image
 from transformers import (
@@ -16,7 +16,7 @@ from copy import deepcopy
 from typing import Callable, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 import torch.nn.functional as F
 from packaging import version
@@ -16,7 +16,7 @@ from dataclasses import dataclass
 from typing import List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection

@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING

 from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
@@ -34,7 +35,7 @@ else:
    _import_structure["pipeline_kandinsky2_2_prior_emb2emb"] = ["KandinskyV22PriorEmb2EmbPipeline"]


-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
@@ -14,7 +14,7 @@

 from typing import Callable, List, Optional, Union

-import PIL
+import PIL.Image
 import torch
 from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection

@@ -15,7 +15,7 @@
 from typing import Callable, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from PIL import Image

@@ -15,7 +15,7 @@
 from typing import Callable, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from PIL import Image

@@ -16,7 +16,7 @@ from copy import deepcopy
 from typing import Callable, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 import torch.nn.functional as F
 from packaging import version
@@ -1,6 +1,6 @@
 from typing import List, Optional, Union

-import PIL
+import PIL.Image
 import torch
 from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection

@@ -1,6 +1,6 @@
 from typing import List, Optional, Union

-import PIL
+import PIL.Image
 import torch
 from transformers import CLIPImageProcessor, CLIPTextModelWithProjection, CLIPTokenizer, CLIPVisionModelWithProjection

@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING

 from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
@@ -24,7 +25,7 @@ else:
    _import_structure["pipeline_latent_diffusion_superresolution"] = ["LDMSuperResolutionPipeline"]


-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
@@ -2,7 +2,7 @@ import inspect
 from typing import List, Optional, Tuple, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 import torch.utils.checkpoint

@@ -1,11 +1,11 @@
 from typing import TYPE_CHECKING

-from ...utils import _LazyModule
+from ...utils import DIFFUSERS_SLOW_IMPORT, _LazyModule


 _import_structure = {"pipeline_latent_diffusion_uncond": ["LDMPipeline"]}

-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    from .pipeline_latent_diffusion_uncond import LDMPipeline
 else:
    import sys
@@ -1,6 +1,7 @@
 from typing import TYPE_CHECKING

 from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
@@ -24,7 +25,7 @@ else:
    _import_structure["pipeline_musicldm"] = ["MusicLDMPipeline"]


-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not (is_transformers_available() and is_torch_available() and is_transformers_version(">=", "4.27.0")):
            raise OptionalDependencyNotAvailable()
@@ -6,6 +6,7 @@ import PIL
 from PIL import Image

 from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
@@ -29,7 +30,7 @@ else:
    _import_structure["pipeline_paint_by_example"] = ["PaintByExamplePipeline"]


-if TYPE_CHECKING:
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
@@ -16,7 +16,7 @@ import inspect
 from typing import Callable, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from transformers import CLIPImageProcessor

@@ -21,7 +21,7 @@ from typing import Any, Dict, List, Optional, Union

 import flax
 import numpy as np
-import PIL
+import PIL.Image
 from flax.core.frozen_dict import FrozenDict
 from huggingface_hub import create_repo, snapshot_download
 from PIL import Image
@@ -394,10 +394,29 @@ class FlaxDiffusionPipeline(ConfigMixin, PushToHubMixin):
        # extract them here
        expected_modules, optional_kwargs = cls._get_signature_keys(pipeline_class)
        passed_class_obj = {k: kwargs.pop(k) for k in expected_modules if k in kwargs}
+        passed_pipe_kwargs = {k: kwargs.pop(k) for k in optional_kwargs if k in kwargs}

-        init_dict, _, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)
+        init_dict, unused_kwargs, _ = pipeline_class.extract_init_dict(config_dict, **kwargs)

-        init_kwargs = {}
+        # define init kwargs
+        init_kwargs = {k: init_dict.pop(k) for k in optional_kwargs if k in init_dict}
+        init_kwargs = {**init_kwargs, **passed_pipe_kwargs}
+
+        # remove `null` components
+        def load_module(name, value):
+            if value[0] is None:
+                return False
+            if name in passed_class_obj and passed_class_obj[name] is None:
+                return False
+            return True
+
+        init_dict = {k: v for k, v in init_dict.items() if load_module(k, v)}
+
+        # Throw nice warnings / errors for fast accelerate loading
+        if len(unused_kwargs) > 0:
+            logger.warning(
+                f"Keyword arguments {unused_kwargs} are not expected by {pipeline_class.__name__} and will be ignored."
+            )

        # inference_params
        params = {}
@@ -26,7 +26,7 @@ from pathlib import Path
 from typing import Any, Callable, Dict, List, Optional, Union

 import numpy as np
-import PIL
+import PIL.Image
 import torch
 from huggingface_hub import ModelCard, create_repo, hf_hub_download, model_info, snapshot_download
 from packaging import version
@@ -670,14 +670,98 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                create_pr=create_pr,
            )

-    def to(
-        self,
-        torch_device: Optional[Union[str, torch.device]] = None,
-        torch_dtype: Optional[torch.dtype] = None,
-        silence_dtype_warnings: bool = False,
-    ):
-        if torch_device is None and torch_dtype is None:
-            return self
+    def to(self, *args, **kwargs):
+        r"""
+        Performs Pipeline dtype and/or device conversion. A torch.dtype and torch.device are inferred from the
+        arguments of `self.to(*args, **kwargs).`
+
+        <Tip>
+
+            If the pipeline already has the correct torch.dtype and torch.device, then it is returned as is. Otherwise,
+            the returned pipeline is a copy of self with the desired torch.dtype and torch.device.
+
+        </Tip>
+
+
+        Here are the ways to call `to`:
+
+        - `to(dtype, silence_dtype_warnings=False) → DiffusionPipeline` to return a pipeline with the specified
+          [`dtype`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype)
+        - `to(device, silence_dtype_warnings=False) → DiffusionPipeline` to return a pipeline with the specified
+          [`device`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.device)
+        - `to(device=None, dtype=None, silence_dtype_warnings=False) → DiffusionPipeline` to return a pipeline with the
+          specified [`device`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.device) and
+          [`dtype`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype)
+
+        Arguments:
+            dtype (`torch.dtype`, *optional*):
+                Returns a pipeline with the specified
+                [`dtype`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.dtype)
+            device (`torch.Device`, *optional*):
+                Returns a pipeline with the specified
+                [`device`](https://pytorch.org/docs/stable/tensor_attributes.html#torch.device)
+            silence_dtype_warnings (`str`, *optional*, defaults to `False`):
+                Whether to omit warnings if the target `dtype` is not compatible with the target `device`.
+
+        Returns:
+            [`DiffusionPipeline`]: The pipeline converted to specified `dtype` and/or `dtype`.
+        """
+
+        torch_dtype = kwargs.pop("torch_dtype", None)
+        if torch_dtype is not None:
+            deprecate("torch_dtype", "0.25.0", "")
+        torch_device = kwargs.pop("torch_device", None)
+        if torch_device is not None:
+            deprecate("torch_device", "0.25.0", "")
+
+        dtype_kwarg = kwargs.pop("dtype", None)
+        device_kwarg = kwargs.pop("device", None)
+        silence_dtype_warnings = kwargs.pop("silence_dtype_warnings", False)
+
+        if torch_dtype is not None and dtype_kwarg is not None:
+            raise ValueError(
+                "You have passed both `torch_dtype` and `dtype` as a keyword argument. Please make sure to only pass `dtype`."
+            )
+
+        dtype = torch_dtype or dtype_kwarg
+
+        if torch_device is not None and device_kwarg is not None:
+            raise ValueError(
+                "You have passed both `torch_device` and `device` as a keyword argument. Please make sure to only pass `device`."
+            )
+
+        device = torch_device or device_kwarg
+
+        dtype_arg = None
+        device_arg = None
+        if len(args) == 1:
+            if isinstance(args[0], torch.dtype):
+                dtype_arg = args[0]
+            else:
+                device_arg = torch.device(args[0]) if args[0] is not None else None
+        elif len(args) == 2:
+            if isinstance(args[0], torch.dtype):
+                raise ValueError(
+                    "When passing two arguments, make sure the first corresponds to `device` and the second to `dtype`."
+                )
+            device_arg = torch.device(args[0]) if args[0] is not None else None
+            dtype_arg = args[1]
+        elif len(args) > 2:
+            raise ValueError("Please make sure to pass at most two arguments (`device` and `dtype`) `.to(...)`")
+
+        if dtype is not None and dtype_arg is not None:
+            raise ValueError(
+                "You have passed `dtype` both as an argument and as a keyword argument. Please only pass one of the two."
+            )
+
+        dtype = dtype or dtype_arg
+
+        if device is not None and device_arg is not None:
+            raise ValueError(
+                "You have passed `device` both as an argument and as a keyword argument. Please only pass one of the two."
+            )
+
+        device = device or device_arg

        # throw warning if pipeline is in "offloaded"-mode but user tries to manually set to GPU.
        def module_is_sequentially_offloaded(module):
@@ -698,14 +782,14 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
        pipeline_is_sequentially_offloaded = any(
            module_is_sequentially_offloaded(module) for _, module in self.components.items()
        )
-        if pipeline_is_sequentially_offloaded and torch_device and torch.device(torch_device).type == "cuda":
+        if pipeline_is_sequentially_offloaded and device and torch.device(device).type == "cuda":
            raise ValueError(
                "It seems like you have activated sequential model offloading by calling `enable_sequential_cpu_offload`, but are now attempting to move the pipeline to GPU. This is not compatible with offloading. Please, move your pipeline `.to('cpu')` or consider removing the move altogether if you use sequential offloading."
            )

        # Display a warning in this case (the operation succeeds but the benefits are lost)
        pipeline_is_offloaded = any(module_is_offloaded(module) for _, module in self.components.items())
-        if pipeline_is_offloaded and torch_device and torch.device(torch_device).type == "cuda":
+        if pipeline_is_offloaded and device and torch.device(device).type == "cuda":
            logger.warning(
                f"It seems like you have activated model offloading by calling `enable_model_cpu_offload`, but are now manually moving the pipeline to GPU. It is strongly recommended against doing so as memory gains from offloading are likely to be lost. Offloading automatically takes care of moving the individual components {', '.join(self.components.keys())} to GPU when needed. To make sure offloading works as expected, you should consider moving the pipeline back to CPU: `pipeline.to('cpu')` or removing the move altogether if you use offloading."
            )
@@ -718,26 +802,26 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
        for module in modules:
            is_loaded_in_8bit = hasattr(module, "is_loaded_in_8bit") and module.is_loaded_in_8bit

-            if is_loaded_in_8bit and torch_dtype is not None:
+            if is_loaded_in_8bit and dtype is not None:
                logger.warning(
                    f"The module '{module.__class__.__name__}' has been loaded in 8bit and conversion to {torch_dtype} is not yet supported. Module is still in 8bit precision."
                )

-            if is_loaded_in_8bit and torch_device is not None:
+            if is_loaded_in_8bit and device is not None:
                logger.warning(
                    f"The module '{module.__class__.__name__}' has been loaded in 8bit and moving it to {torch_dtype} via `.to()` is not yet supported. Module is still on {module.device}."
                )
            else:
-                module.to(torch_device, torch_dtype)
+                module.to(device, dtype)

            if (
                module.dtype == torch.float16
-                and str(torch_device) in ["cpu"]
+                and str(device) in ["cpu"]
                and not silence_dtype_warnings
                and not is_offloaded
            ):
                logger.warning(
-                    "Pipelines loaded with `torch_dtype=torch.float16` cannot run with `cpu` device. It"
+                    "Pipelines loaded with `dtype=torch.float16` cannot run with `cpu` device. It"
                    " is not recommended to move them to `cpu` as running them will fail. Please make"
                    " sure to use an accelerator to run the pipeline in inference, due to the lack of"
                    " support for`float16` operations on this device in PyTorch. Please, remove the"
@@ -760,6 +844,21 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):

        return torch.device("cpu")

+    @property
+    def dtype(self) -> torch.dtype:
+        r"""
+        Returns:
+            `torch.dtype`: The torch dtype on which the pipeline is located.
+        """
+        module_names, _ = self._get_signature_keys(self)
+        modules = [getattr(self, n, None) for n in module_names]
+        modules = [m for m in modules if isinstance(m, torch.nn.Module)]
+
+        for module in modules:
+            return module.dtype
+
+        return torch.float32
+
    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs):
        r"""
@@ -934,6 +1033,11 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
        # 1. Download the checkpoints and configs
        # use snapshot download here to get it working from from_pretrained
        if not os.path.isdir(pretrained_model_name_or_path):
+            if pretrained_model_name_or_path.count("/") > 1:
+                raise ValueError(
+                    f'The provided pretrained_model_name_or_path "{pretrained_model_name_or_path}"'
+                    " is neither a valid local path nor a valid repo id. Please check the parameter."
+                )
            cached_folder = cls.download(
                pretrained_model_name_or_path,
                cache_dir=cache_dir,
@@ -1222,12 +1326,19 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                    return torch.device(module._hf_hook.execution_device)
        return self.device

-    def enable_model_cpu_offload(self, gpu_id: int = 0, device: Union[torch.device, str] = "cuda"):
+    def enable_model_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
        r"""
        Offloads all models to CPU using accelerate, reducing memory usage with a low impact on performance. Compared
        to `enable_sequential_cpu_offload`, this method moves one whole model at a time to the GPU when its `forward`
        method is called, and the model remains in GPU until the next model runs. Memory savings are lower than with
        `enable_sequential_cpu_offload`, but performance is much better due to the iterative execution of the `unet`.
+
+        Arguments:
+            gpu_id (`int`, *optional*):
+                The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
+            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
+                The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
+                default to "cuda".
        """
        if self.model_cpu_offload_seq is None:
            raise ValueError(
@@ -1239,7 +1350,20 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
        else:
            raise ImportError("`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher.")

-        device = torch.device(f"cuda:{gpu_id}")
+        torch_device = torch.device(device)
+        device_index = torch_device.index
+
+        if gpu_id is not None and device_index is not None:
+            raise ValueError(
+                f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
+                f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
+            )
+
+        # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0
+        self._offload_gpu_id = gpu_id or torch_device.index or self._offload_gpu_id or 0
+
+        device_type = torch_device.type
+        device = torch.device(f"{device_type}:{self._offload_gpu_id}")

        if self.device.type != "cpu":
            self.to("cpu", silence_dtype_warnings=True)
@@ -1274,7 +1398,10 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):

    def maybe_free_model_hooks(self):
        r"""
-        TODO: Better doc string
+        Function that offloads all components, removes all model hooks that were added when using
+        `enable_model_cpu_offload` and then applies them again. In case the model has not been offloaded this function
+        is a no-op. Make sure to add this function to the end of the `__call__` function of your pipeline so that it
+        functions correctly when applying enable_model_cpu_offload.
        """
        if not hasattr(self, "_all_hooks") or len(self._all_hooks) == 0:
            # `enable_model_cpu_offload` has not be called, so silently do nothing
@@ -1288,21 +1415,40 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
        # make sure the model is in the same state as before calling it
        self.enable_model_cpu_offload()

-    def enable_sequential_cpu_offload(self, gpu_id: int = 0, device: Union[torch.device, str] = "cuda"):
+    def enable_sequential_cpu_offload(self, gpu_id: Optional[int] = None, device: Union[torch.device, str] = "cuda"):
        r"""
        Offloads all models to CPU using 🤗 Accelerate, significantly reducing memory usage. When called, the state
        dicts of all `torch.nn.Module` components (except those in `self._exclude_from_cpu_offload`) are saved to CPU
        and then moved to `torch.device('meta')` and loaded to GPU only when their specific submodule has its `forward`
        method called. Offloading happens on a submodule basis. Memory savings are higher than with
        `enable_model_cpu_offload`, but performance is lower.
+
+        Arguments:
+            gpu_id (`int`, *optional*):
+                The ID of the accelerator that shall be used in inference. If not specified, it will default to 0.
+            device (`torch.Device` or `str`, *optional*, defaults to "cuda"):
+                The PyTorch device type of the accelerator that shall be used in inference. If not specified, it will
+                default to "cuda".
        """
        if is_accelerate_available() and is_accelerate_version(">=", "0.14.0"):
            from accelerate import cpu_offload
        else:
            raise ImportError("`enable_sequential_cpu_offload` requires `accelerate v0.14.0` or higher")

-        if device == "cuda":
-            device = torch.device(f"{device}:{gpu_id}")
+        torch_device = torch.device(device)
+        device_index = torch_device.index
+
+        if gpu_id is not None and device_index is not None:
+            raise ValueError(
+                f"You have passed both `gpu_id`={gpu_id} and an index as part of the passed device `device`={device}"
+                f"Cannot pass both. Please make sure to either not define `gpu_id` or not pass the index as part of the device: `device`={torch_device.type}"
+            )
+
+        # _offload_gpu_id should be set to passed gpu_id (or id in passed `device`) or default to previously set id or default to 0
+        self._offload_gpu_id = gpu_id or torch_device.index or self._offload_gpu_id or 0
+
+        device_type = torch_device.type
+        device = torch.device(f"{device_type}:{self._offload_gpu_id}")

        if self.device.type != "cpu":
            self.to("cpu", silence_dtype_warnings=True)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Patrick von Platen	c16ecac3c7	debug	2023-09-26 16:53:20 +02:00
Patrick von Platen	2fedbbf9af	finish	2023-09-26 15:57:10 +02:00
Patrick von Platen	234600ce03	fix SDXL flax init	2023-09-26 15:54:11 +02:00
Ernie Chu	21e402faa0	fix-VaeImageProcessor-docstring (#5182 ) ``` do_binarize (`bool`, optional, defaults to `True`) \| v do_binarize (`bool`, optional, defaults to `False`) ```	2023-09-26 15:06:45 +02:00
Bagheera	4a06c74547	Min-SNR Gamma: follow-up fix for zero-terminal SNR models on v-prediction or epsilon (#5177 ) * merge with main * fix flax example * fix onnx example --------- Co-authored-by: bghira <bghira@users.github.com> Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2023-09-26 18:14:52 +05:30
Bagheera	89d8f84893	Timestep bias for fine-tuning SDXL (#5094 ) * Timestep bias for fine-tuning SDXL * Adjust parameter choices to include "range" and reword the help statements * Condition our use of weighted timesteps on the value of timestep_bias_strategy * style --------- Co-authored-by: bghira <bghira@users.github.com> Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2023-09-26 13:45:37 +05:30
Dhruv Nair	bdd2544673	Tests compile fixes (#5148 ) * test fix * fix tests * fix report name --------- Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>	2023-09-26 11:36:46 +05:30
Patrick von Platen	a91a273d0b	[Docs] Try to fix doc builder (#5180 ) * try to fix docs * try to fix docs	2023-09-25 20:24:50 +02:00
Patrick von Platen	bed8aceca1	make style	2023-09-25 20:24:03 +02:00
Ryan Dick	415093335b	Fix the total_downscale_factor returned by FullAdapterXL T2IAdapters (#5134 ) * Fix FullAdapterXL.total_downscale_factor. * Fix incorrect error message in T2IAdapter.__init__(...). * Move IP-Adapter test_total_downscale_factor(...) to pipeline test file (requested in code review). * Add more info to error message about an unsupported T2I-Adapter adapter_type. --------- Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>	2023-09-25 20:23:14 +02:00
Hengwen Tong	dfdf85d32c	[pipeline utils] sanitize pretrained_model_name_or_path (#5173 ) Make sure the repo_id is valid before sending it to huggingface_hub to get a more understandable error message. Re #5110 Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>	2023-09-25 20:22:41 +02:00
Bagheera	539846a7d5	SDXL microconditioning documentation should indicate the correct default order of parameters, so that developers know (#5155 ) * SDXL microconditioning documentation should indicate the correct default order of parameters, so that developers know * SDXL microconditioning documentation should indicate the correct default order of parameters, so that developers know * empty --------- Co-authored-by: bghira <bghira@users.github.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>	2023-09-25 20:22:09 +02:00
Patrick von Platen	d70944bf7f	fix docs	2023-09-25 19:55:49 +02:00
Patrick von Platen	589cd8100b	make style	2023-09-25 19:27:20 +02:00
Carson Katri	6281d2066b	Add callbacks to `WuerstchenDecoderPipeline` and `WuerstchenCombinedPipeline` (#5154 )	2023-09-25 19:26:53 +02:00
Anh71me	28254c79b6	Fix type annotation (#5146 ) * Fix type annotation on Scheduler.from_pretrained * Fix type annotation on PIL.Image	2023-09-25 19:26:39 +02:00
MLRichter	0bc6be6960	Update wuerstchen.md (#5156 )	2023-09-25 18:43:08 +02:00
Patrick von Platen	144c3a8b7c	[Imports] Fix many import bugs and make sure that doc builder CI test works correctly (#5176 ) * [Doc builder] Ensure slow import for doc builder * Apply suggestions from code review * env for doc builder * fix more * [Diffusers] Set import to slow as env variable * fix docs * fix docs * Apply suggestions from code review * Apply suggestions from code review * fix docs * fix docs	2023-09-25 18:06:51 +02:00
Patrick von Platen	30a512ea69	[Core] Improve `.to(...)` method, fix offloads multi-gpu, add docstring, add dtype (#5132 ) * fix cpu offload * fix * fix * Update src/diffusers/pipelines/pipeline_utils.py * make style * Apply suggestions from code review Co-authored-by: YiYi Xu <yixu310@gmail.com> Co-authored-by: Pedro Cuenca <pedro@huggingface.co> * fix more * fix more --------- Co-authored-by: YiYi Xu <yixu310@gmail.com> Co-authored-by: Pedro Cuenca <pedro@huggingface.co>	2023-09-25 14:10:18 +02:00
Dhruv Nair	92f15f5bd4	Model CPU offload fix for BLIPDiffusion (#5174 ) cpu offload fix for blip diffusion	2023-09-25 17:07:32 +05:30
Patrick von Platen	22b19d578e	[Tests] Add is flaky decorator (#5139 ) * add is flaky decorator * fix more	2023-09-25 13:24:44 +02:00
Sayak Paul	787195fe20	Fix/controlnet lora (#5157 ) * print * print * print * print * print * debugging * debugging * debugging * debugging * safer condition. * remove prints and try excepts. * Empty-Commit * Apply suggestions from code review --------- Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>	2023-09-25 12:08:05 +02:00
Mishig	48664d62b8	Delete duplicatd doc file (#5169 )	2023-09-24 19:58:13 +02:00
YiYi Xu	5b11c5dc77	fix the add_noise function for dpm-multi et al (#5158 ) * remove to _device() for sigmas * update add_noise to use simgas --------- Co-authored-by: yiyixuxu <yixu310@gmail,com>	2023-09-23 09:07:50 -10:00
Sayak Paul	310cf32801	add: note on whom to tag for issues related to community pipelines. (#5083 )	2023-09-23 17:01:37 +01:00
Steven Liu	06b316ef5c	[docs] Improved image-to-image guide (#5020 ) * finish first draft * feedback * feedback	2023-09-22 13:20:30 -07:00
Pedro Cuenca	3651b14cf4	SDXL flax (#4254 ) * support transformer_layers_per block in flax UNet * add support for text_time additional embeddings to Flax UNet * rename attention layers for VAE * add shape asserts when renaming attention layers * transpose VAE attention layers * add pipeline flax SDXL code [WIP] * continue add pipeline flax SDXL code [WIP] * cleanup * Working on JIT support Fixed prompt embedding shapes so they work in parallel mode. Assuming we always have both text encoders for now, for simplicity. * Fixing embeddings (untested) * Remove spurious line * Shard guidance_scale when jitting. * Decode images * Fix sharding * style * Refiner UNet can be loaded. * Refiner / img2img pipeline * Allow latent outputs from base and latent inputs in refiner This makes it possible to chain base + refiner without having to use the vae decoder in the base model, the vae encoder in the refiner, skipping conversions to/from PIL, and avoiding TPU <-> CPU memory copies. * Adapt to FlaxCLIPTextModelOutput * Update Flax XL pipeline to FlaxCLIPTextModelOutput * make fix-copies * make style * add euler scheduler * Fix import * Fix copies, comment unused code. * Fix SDXL Flax imports * Fix euler discrete begin * improve init import * finish * put discrete euler in init * fix flax euler * Fix more * make style * correct init * correct init * Temporarily remove FlaxStableDiffusionXLImg2ImgPipeline * correct pipelines * finish --------- Co-authored-by: Martin Müller <martin.muller.me@gmail.com> Co-authored-by: patil-suraj <surajp815@gmail.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>	2023-09-22 18:34:04 +02:00
Pedro Cuenca	2e860e89eb	SDXL: update links to refine docs (#5101 ) * SDXL: update links to refine docs * make style	2023-09-22 13:17:17 +02:00
Younes Belkada	493f9529d7	[`PEFT` / `LoRA`] PEFT integration - text encoder (#5058 ) * more fixes * up * up * style * add in setup * oops * more changes * v1 rzfactor CI * Apply suggestions from code review Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> * few todos * protect torch import * style * fix fuse text encoder * Update src/diffusers/loaders.py Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> * replace with `recurse_replace_peft_layers` * keep old modules for BC * adjustments on `adjust_lora_scale_text_encoder` * nit * move tests * add conversion utils * remove unneeded methods * use class method instead * oops * use `base_version` * fix examples * fix CI * fix weird error with python 3.8 * fix * better fix * style * Apply suggestions from code review Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> * Apply suggestions from code review Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> * add comment * Apply suggestions from code review Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> * conv2d support for recurse remove * added docstrings * more docstring * add deprecate * revert * try to fix merge conflicts * v1 tests * add new decorator * add saving utilities test * adapt tests a bit * add save / from_pretrained tests * add saving tests * add scale tests * fix deps tests * fix lora CI * fix tests * add comment * fix * style * add slow tests * slow tests pass * style * Update src/diffusers/utils/import_utils.py Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com> * Apply suggestions from code review Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com> * circumvents pattern finding issue * left a todo * Apply suggestions from code review Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> * update hub path * add lora workflow * fix --------- Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> Co-authored-by: Benjamin Bossan <BenjaminBossan@users.noreply.github.com>	2023-09-22 13:03:39 +02:00
hysts	b32555a2da	[docs] Add missing parenthesis in the sample code of BLIP Diffusion (#5144 ) Add missing parenthesis in the sample code of BLIP Diffusion	2023-09-22 09:38:17 +01:00