update

Remove unnecessary single file tests for SD Cascade UNet (#7996 )
2024-05-24 07:38:13 +00:00 · 2024-05-23 06:02:02 +00:00 · 2024-05-22 12:29:59 +05:30 · 2024-05-22 12:29:11 +05:30 · 2024-05-21 14:58:10 +05:30 · 2024-05-21 08:18:21 +05:30
98 changed files with 4624 additions and 570 deletions
@@ -39,7 +39,7 @@ jobs:
          python utils/print_env.py
      - name: Diffusers Benchmarking
        env:
-            HUGGING_FACE_HUB_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
+            HF_TOKEN: ${{ secrets.DIFFUSERS_BOT_TOKEN }}
            BASE_PATH: benchmark_outputs
        run: |
          export TOTAL_GPU_MEMORY=$(python -c "import torch; print(torch.cuda.get_device_properties(0).total_memory / (1024**3))")
@@ -90,24 +90,11 @@ jobs:

      - name: Post to a Slack channel
        id: slack
-        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
+        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          # Slack channel id, channel name, or user id to post message.
          # See also: https://api.slack.com/methods/chat.postMessage#channels
-          channel-id: ${{ env.CI_SLACK_CHANNEL }}
-          # For posting a rich message using Block Kit
-          payload: |
-            {
-              "text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}",
-              "blocks": [
-                {
-                  "type": "section",
-                  "text": {
-                    "type": "mrkdwn",
-                    "text": "${{ matrix.image-name }} Docker Image build result: ${{ job.status }}\n${{ github.event.head_commit.url }}"
-                  }
-                }
-              ]
-            }
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
+          title: "🤗 Results of the ${{ matrix.image-name }} Docker Image build"
+          status: ${{ job.status }}
+          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
@@ -81,7 +81,7 @@ jobs:

      - name: Nightly PyTorch CUDA checkpoint (pipelines) tests
        env:
-          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
          CUBLAS_WORKSPACE_CONFIG: :16:8
        run: |
@@ -141,7 +141,7 @@ jobs:
    - name: Run nightly PyTorch CUDA tests for non-pipeline modules
      if: ${{ matrix.module != 'examples'}}
      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
@@ -154,7 +154,7 @@ jobs:
    - name: Run nightly example tests with Torch
      if: ${{ matrix.module == 'examples' }}
      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
@@ -211,7 +211,7 @@ jobs:

    - name: Run nightly LoRA tests with PEFT and Torch
      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
@@ -269,7 +269,7 @@ jobs:

    - name: Run nightly Flax TPU tests
      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m pytest -n 0 \
          -s -v -k "Flax" \
@@ -324,7 +324,7 @@ jobs:

    - name: Run nightly ONNXRuntime CUDA tests
      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Onnx" \
@@ -390,7 +390,7 @@ jobs:
        shell: arch -arch arm64 bash {0}
        env:
          HF_HOME: /System/Volumes/Data/mnt/cache
-          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
            --report-log=tests_torch_mps.log \
@@ -156,7 +156,7 @@ jobs:
      if: ${{ matrix.config.framework == 'pytorch_examples' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install peft
+        python -m uv pip install peft timm
        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          --make-reports=tests_${{ matrix.config.report }} \
          examples
@@ -87,7 +87,7 @@ jobs:
          python utils/print_env.py
      - name: Slow PyTorch CUDA checkpoint tests on Ubuntu
        env:
-          HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
          CUBLAS_WORKSPACE_CONFIG: :16:8
        run: |
@@ -144,7 +144,7 @@ jobs:

    - name: Run slow PyTorch CUDA tests
      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
@@ -194,7 +194,7 @@ jobs:

    - name: Run slow PEFT CUDA tests
      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
@@ -243,7 +243,7 @@ jobs:

    - name: Run slow Flax TPU tests
      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m pytest -n 0 \
          -s -v -k "Flax" \
@@ -290,7 +290,7 @@ jobs:

    - name: Run slow ONNXRuntime CUDA tests
      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Onnx" \
@@ -337,7 +337,7 @@ jobs:
        python utils/print_env.py
    - name: Run example tests on GPU
      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
    - name: Failure short reports
@@ -378,7 +378,7 @@ jobs:
        python utils/print_env.py
    - name: Run example tests on GPU
      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
    - name: Failure short reports
@@ -423,9 +423,10 @@ jobs:

    - name: Run example tests on GPU
      env:
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install timm
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/

    - name: Failure short reports
@@ -107,7 +107,7 @@ jobs:
      if: ${{ matrix.config.framework == 'pytorch_examples' }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install peft
+        python -m uv pip install peft timm
        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
          --make-reports=tests_${{ matrix.config.report }} \
          examples
@@ -23,7 +23,7 @@ concurrency:
 jobs:
  run_fast_tests_apple_m1:
    name: Fast PyTorch MPS tests on MacOS
-    runs-on: [ self-hosted, apple-m1 ]
+    runs-on: macos-13-xlarge

    steps:
    - name: Checkout diffusers
@@ -59,7 +59,7 @@ jobs:
      shell: arch -arch arm64 bash {0}
      env:
        HF_HOME: /System/Volumes/Data/mnt/cache
-        HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        ${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/

@@ -25,6 +25,6 @@ jobs:

      - name: Update metadata
        env:
-          HUGGING_FACE_HUB_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }}
+          HF_TOKEN: ${{ secrets.SAYAK_HF_TOKEN }}
        run: |
          python utils/update_metadata.py --commit_sha ${{ github.sha }}
@@ -355,7 +355,7 @@ You will need basic `git` proficiency to be able to contribute to
 manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
 Git](https://git-scm.com/book/en/v2) is a very good reference.

-Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/main/setup.py#L265)):
+Follow these steps to start contributing ([supported Python versions](https://github.com/huggingface/diffusers/blob/42f25d601a910dceadaee6c44345896b4cfa9928/setup.py#L270)):

 1. Fork the [repository](https://github.com/huggingface/diffusers) by
 clicking on the 'Fork' button on the repository's page. This creates a copy of the code
@@ -305,6 +305,8 @@
      title: Personalized Image Animator (PIA)
    - local: api/pipelines/pixart
      title: PixArt-α
+    - local: api/pipelines/pixart_sigma
+      title: PixArt-Σ
    - local: api/pipelines/self_attention_guidance
      title: Self-Attention Guidance
    - local: api/pipelines/semantic_stable_diffusion
@@ -31,7 +31,7 @@ Some notes about this pipeline:

 <Tip>

-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-components-across-pipelines) section to learn how to efficiently load the same components into multiple pipelines.
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.

 </Tip>

@@ -0,0 +1,151 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# PixArt-Σ
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/pixart/header_collage_sigma.jpg)
+
+[PixArt-Σ: Weak-to-Strong Training of Diffusion Transformer for 4K Text-to-Image Generation](https://huggingface.co/papers/2403.04692) is by Junsong Chen, Jincheng Yu, Chongjian Ge, Lewei Yao, Enze Xie, Yue Wu, Zhongdao Wang, James Kwok, Ping Luo, Huchuan Lu, and Zhenguo Li.
+
+The abstract from the paper is:
+
+*In this paper, we introduce PixArt-Σ, a Diffusion Transformer model (DiT) capable of directly generating images at 4K resolution. PixArt-Σ represents a significant advancement over its predecessor, PixArt-α, offering images of markedly higher fidelity and improved alignment with text prompts. A key feature of PixArt-Σ is its training efficiency. Leveraging the foundational pre-training of PixArt-α, it evolves from the ‘weaker’ baseline to a ‘stronger’ model via incorporating higher quality data, a process we term “weak-to-strong training”. The advancements in PixArt-Σ are twofold: (1) High-Quality Training Data: PixArt-Σ incorporates superior-quality image data, paired with more precise and detailed image captions. (2) Efficient Token Compression: we propose a novel attention module within the DiT framework that compresses both keys and values, significantly improving efficiency and facilitating ultra-high-resolution image generation. Thanks to these improvements, PixArt-Σ achieves superior image quality and user prompt adherence capabilities with significantly smaller model size (0.6B parameters) than existing text-to-image diffusion models, such as SDXL (2.6B parameters) and SD Cascade (5.1B parameters). Moreover, PixArt-Σ’s capability to generate 4K images supports the creation of high-resolution posters and wallpapers, efficiently bolstering the production of high-quality visual content in industries such as film and gaming.*
+
+You can find the original codebase at [PixArt-alpha/PixArt-sigma](https://github.com/PixArt-alpha/PixArt-sigma) and all the available checkpoints at [PixArt-alpha](https://huggingface.co/PixArt-alpha).
+
+Some notes about this pipeline:
+
+* It uses a Transformer backbone (instead of a UNet) for denoising. As such it has an architecture similar to [DiT](https://hf.co/docs/transformers/model_doc/dit).
+* It was trained using text conditions computed from T5. This aspect makes the pipeline better at following complex text prompts with intricate details.
+* It is good at producing high-resolution images at different aspect ratios. To get the best results, the authors recommend some size brackets which can be found [here](https://github.com/PixArt-alpha/PixArt-sigma/blob/master/diffusion/data/datasets/utils.py).
+* It rivals the quality of state-of-the-art text-to-image generation systems (as of this writing) such as PixArt-α, Stable Diffusion XL, Playground V2.0 and DALL-E 3, while being more efficient than them.
+* It shows the ability to generate super-high-resolution images, such as 2048px or even 4K.
+* It shows that text-to-image models can grow from a weak model to a stronger one through several improvements (VAEs, datasets, and so on.)
+
+<Tip>
+
+Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
+
+</Tip>
+
+## Inference with under 8GB GPU VRAM
+
+Run the [`PixArtSigmaPipeline`] with under 8GB GPU VRAM by loading the text encoder in 8-bit precision. Let's walk through a full-fledged example. 
+
+First, install the [bitsandbytes](https://github.com/TimDettmers/bitsandbytes) library:
+
+```bash
+pip install -U bitsandbytes
+```
+
+Then load the text encoder in 8-bit:
+
+```python
+from transformers import T5EncoderModel
+from diffusers import PixArtSigmaPipeline
+import torch
+
+text_encoder = T5EncoderModel.from_pretrained(
+    "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
+    subfolder="text_encoder",
+    load_in_8bit=True,
+    device_map="auto",
+
+)
+pipe = PixArtSigmaPipeline.from_pretrained(
+    "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
+    text_encoder=text_encoder,
+    transformer=None,
+    device_map="balanced"
+)
+```
+
+Now, use the `pipe` to encode a prompt:
+
+```python
+with torch.no_grad():
+    prompt = "cute cat"
+    prompt_embeds, prompt_attention_mask, negative_embeds, negative_prompt_attention_mask = pipe.encode_prompt(prompt)
+```
+
+Since text embeddings have been computed, remove the `text_encoder` and `pipe` from memory, and free up some GPU VRAM:
+
+```python
+import gc 
+
+def flush():
+    gc.collect()
+    torch.cuda.empty_cache()
+
+del text_encoder
+del pipe
+flush()
+```
+
+Then compute the latents with the prompt embeddings as inputs:
+
+```python
+pipe = PixArtSigmaPipeline.from_pretrained(
+    "PixArt-alpha/PixArt-Sigma-XL-2-1024-MS",
+    text_encoder=None,
+    torch_dtype=torch.float16,
+).to("cuda")
+
+latents = pipe(
+    negative_prompt=None, 
+    prompt_embeds=prompt_embeds,
+    negative_prompt_embeds=negative_embeds,
+    prompt_attention_mask=prompt_attention_mask,
+    negative_prompt_attention_mask=negative_prompt_attention_mask,
+    num_images_per_prompt=1,
+    output_type="latent",
+).images
+
+del pipe.transformer
+flush()
+```
+
+<Tip>
+
+Notice that while initializing `pipe`, you're setting `text_encoder` to `None` so that it's not loaded.
+
+</Tip>
+
+Once the latents are computed, pass them to the VAE to decode into a real image:
+
+```python
+with torch.no_grad():
+    image = pipe.vae.decode(latents / pipe.vae.config.scaling_factor, return_dict=False)[0]
+image = pipe.image_processor.postprocess(image, output_type="pil")[0]
+image.save("cat.png")
+```
+
+By deleting components you aren't using and flushing the GPU VRAM, you should be able to run [`PixArtSigmaPipeline`] with under 8GB GPU VRAM.
+
+![](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/pixart/8bits_cat.png)
+
+If you want a report of your memory usage, run this [script](https://gist.github.com/sayakpaul/3ae0f847001d342af27018a96f467e4e).
+
+<Tip warning={true}>
+
+Text embeddings computed in 8-bit can impact the quality of the generated images because of the information loss in the representation space caused by the reduced precision. It's recommended to compare the outputs with and without 8-bit.
+
+</Tip>
+
+While loading the `text_encoder`, you set `load_in_8bit` to `True`. You could also specify `load_in_4bit` to bring your memory requirements down even further to under 7GB.
+
+## PixArtSigmaPipeline
+
+[[autodoc]] PixArtSigmaPipeline
+	- all
+	- __call__
+	
@@ -12,4 +12,10 @@ specific language governing permissions and limitations under the License.

 # Video Processor

-The `VideoProcessor` provides a unified API for video pipelines to prepare inputs for VAE encoding and post-processing outputs once they're decoded. The class inherits [`VaeImageProcessor`] so it includes transformations such as resizing, normalization, and conversion between PIL Image, PyTorch, and NumPy arrays.
+The [`VideoProcessor`] provides a unified API for video pipelines to prepare inputs for VAE encoding and post-processing outputs once they're decoded. The class inherits [`VaeImageProcessor`] so it includes transformations such as resizing, normalization, and conversion between PIL Image, PyTorch, and NumPy arrays.
+
+## VideoProcessor
+
+[[autodoc]] video_processor.VideoProcessor.preprocess_video
+
+[[autodoc]] video_processor.VideoProcessor.postprocess_video
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.

 # Speed up inference

-There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attetntion](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times.
+There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times.

 > [!TIP]
 > Optimizing for inference speed or reduced memory usage can lead to improved performance in the other category, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about lowering memory usage in the [Reduce memory usage](memory) guide.
@@ -19,13 +19,74 @@ The denoising loop of a pipeline can be modified with custom defined functions u

 This guide will demonstrate how callbacks work by a few features you can implement with them.

+## Official callbacks
+
+We provide a list of callbacks you can plug into an existing pipeline and modify the denoising loop. This is the current list of official callbacks:
+
+- `SDCFGCutoffCallback`: Disables the CFG after a certain number of steps for all SD 1.5 pipelines, including text-to-image, image-to-image, inpaint, and controlnet.
+- `SDXLCFGCutoffCallback`: Disables the CFG after a certain number of steps for all SDXL pipelines, including text-to-image, image-to-image, inpaint, and controlnet.
+- `IPAdapterScaleCutoffCallback`: Disables the IP Adapter after a certain number of steps for all pipelines supporting IP-Adapter.
+
+> [!TIP]
+> If you want to add a new official callback, feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) or [submit a PR](https://huggingface.co/docs/diffusers/main/en/conceptual/contribution#how-to-open-a-pr).
+
+To set up a callback, you need to specify the number of denoising steps after which the callback comes into effect. You can do so by using either one of these two arguments:
+
+- `cutoff_step_ratio`: Float number with the ratio of the steps.
+- `cutoff_step_index`: Integer number with the exact number of the step.
+
+```python
+import torch
+
+from diffusers import DPMSolverMultistepScheduler, StableDiffusionXLPipeline
+from diffusers.callbacks import SDXLCFGCutoffCallback
+
+
+callback = SDXLCFGCutoffCallback(cutoff_step_ratio=0.4)
+# can also be used with cutoff_step_index
+# callback = SDXLCFGCutoffCallback(cutoff_step_ratio=None, cutoff_step_index=10)
+
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to("cuda")
+pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, use_karras_sigmas=True)
+
+prompt = "a sports car at the road, best quality, high quality, high detail, 8k resolution"
+
+generator = torch.Generator(device="cpu").manual_seed(2628670641)
+
+out = pipeline(
+    prompt=prompt,
+    negative_prompt="",
+    guidance_scale=6.5,
+    num_inference_steps=25,
+    generator=generator,
+    callback_on_step_end=callback,
+)
+
+out.images[0].save("official_callback.png")
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/without_cfg_callback.png" alt="generated image of a sports car at the road" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">without SDXLCFGCutoffCallback</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/with_cfg_callback.png" alt="generated image of a sports car at the road with cfg callback" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">with SDXLCFGCutoffCallback</figcaption>
+  </div>
+</div>
+
 ## Dynamic classifier-free guidance

 Dynamic classifier-free guidance (CFG) is a feature that allows you to disable CFG after a certain number of inference steps which can help you save compute with minimal cost to performance. The callback function for this should have the following arguments:

-* `pipeline` (or the pipeline instance) provides access to important properties such as `num_timesteps` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipeline._guidance_scale=0.0`.
-* `step_index` and `timestep` tell you where you are in the denoising loop. Use `step_index` to turn off CFG after reaching 40% of `num_timesteps`.
-* `callback_kwargs` is a dict that contains tensor variables you can modify during the denoising loop. It only includes variables specified in the `callback_on_step_end_tensor_inputs` argument, which is passed to the pipeline's `__call__` method. Different pipelines may use different sets of variables, so please check a pipeline's `_callback_tensor_inputs` attribute for the list of variables you can modify. Some common variables include `latents` and `prompt_embeds`. For this function, change the batch size of `prompt_embeds` after setting `guidance_scale=0.0` in order for it to work properly.
+- `pipeline` (or the pipeline instance) provides access to important properties such as `num_timesteps` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipeline._guidance_scale=0.0`.
+- `step_index` and `timestep` tell you where you are in the denoising loop. Use `step_index` to turn off CFG after reaching 40% of `num_timesteps`.
+- `callback_kwargs` is a dict that contains tensor variables you can modify during the denoising loop. It only includes variables specified in the `callback_on_step_end_tensor_inputs` argument, which is passed to the pipeline's `__call__` method. Different pipelines may use different sets of variables, so please check a pipeline's `_callback_tensor_inputs` attribute for the list of variables you can modify. Some common variables include `latents` and `prompt_embeds`. For this function, change the batch size of `prompt_embeds` after setting `guidance_scale=0.0` in order for it to work properly.

 Your callback function should look something like this:

@@ -981,7 +981,7 @@ def collate_fn(examples, with_prior_preservation=False):


 class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
@@ -1136,7 +1136,7 @@ def collate_fn(examples, with_prior_preservation=False):


 class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
@@ -68,6 +68,7 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
 |   InstantID Pipeline                                                                                               | Stable Diffusion XL Pipeline that supports InstantID                                                                                                                                                                                                                                                                                                                                                 |  [InstantID Pipeline](#instantid-pipeline) | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/InstantX/InstantID) | [Haofan Wang](https://github.com/haofanwang) |
 |   UFOGen Scheduler                                                                                               | Scheduler for UFOGen Model (compatible with Stable Diffusion pipelines)                                                                                                                                                                                                                                                                                                                                                 |  [UFOGen Scheduler](#ufogen-scheduler) | - | [dg845](https://github.com/dg845) |
 | Stable Diffusion XL IPEX Pipeline | Accelerate Stable Diffusion XL inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [Stable Diffusion XL on IPEX](#stable-diffusion-xl-on-ipex) | - | [Dan Li](https://github.com/ustcuna/) |
+| Stable Diffusion BoxDiff Pipeline | Training-free controlled generation with bounding boxes using [BoxDiff](https://github.com/showlab/BoxDiff) | [Stable Diffusion BoxDiff Pipeline](#stable-diffusion-boxdiff) | - | [Jingyang Zhang](https://github.com/zjysteven/) |

 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.

@@ -1676,6 +1677,68 @@ image = pipe(prompt, image=input_image, strength=0.75,).images[0]
 image.save('tensorrt_img2img_new_zealand_hills.png')
 ```

+### Stable Diffusion BoxDiff
+BoxDiff is a training-free method for controlled generation with bounding box coordinates. It should work with any Stable Diffusion model. Below is an example with `stable-diffusion-2-1-base`.
+```py
+import torch
+from PIL import Image, ImageDraw
+from copy import deepcopy
+
+from examples.community.pipeline_stable_diffusion_boxdiff import StableDiffusionBoxDiffPipeline
+
+def draw_box_with_text(img, boxes, names):
+    colors = ["red", "olive", "blue", "green", "orange", "brown", "cyan", "purple"]
+    img_new = deepcopy(img)
+    draw = ImageDraw.Draw(img_new)
+
+    W, H = img.size
+    for bid, box in enumerate(boxes):
+        draw.rectangle([box[0] * W, box[1] * H, box[2] * W, box[3] * H], outline=colors[bid % len(colors)], width=4)
+        draw.text((box[0] * W, box[1] * H), names[bid], fill=colors[bid % len(colors)])
+    return img_new
+
+pipe = StableDiffusionBoxDiffPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-2-1-base",
+    torch_dtype=torch.float16,
+)
+pipe.to("cuda")
+
+# example 1
+prompt = "as the aurora lights up the sky, a herd of reindeer leisurely wanders on the grassy meadow, admiring the breathtaking view, a serene lake quietly reflects the magnificent display, and in the distance, a snow-capped mountain stands majestically, fantasy, 8k, highly detailed"
+phrases = [
+    "aurora",
+    "reindeer",
+    "meadow",
+    "lake",
+    "mountain"
+]
+boxes = [[1,3,512,202], [75,344,421,495], [1,327,508,507], [2,217,507,341], [1,135,509,242]]
+
+# example 2
+# prompt = "A rabbit wearing sunglasses looks very proud"
+# phrases = ["rabbit", "sunglasses"]
+# boxes = [[67,87,366,512], [66,130,364,262]]
+
+boxes = [[x / 512 for x in box] for box in boxes]
+
+images = pipe(
+    prompt,
+    boxdiff_phrases=phrases,
+    boxdiff_boxes=boxes,
+    boxdiff_kwargs={
+        "attention_res": 16,
+        "normalize_eot": True
+    },
+    num_inference_steps=50,
+    guidance_scale=7.5,
+    generator=torch.manual_seed(42),
+    safety_checker=None
+).images
+
+draw_box_with_text(images[0], boxes, phrases).save("output.png")
+```
+
+
 ### Stable Diffusion Reference

 This pipeline uses the Reference Control. Refer to the [sd-webui-controlnet discussion: Reference-only Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1236)[sd-webui-controlnet discussion: Reference-adain Control](https://github.com/Mikubill/sd-webui-controlnet/discussions/1280).
@@ -460,7 +460,7 @@ class StableDiffusionUpscaleLDM3DPipeline(
            )

        # verify batch size of prompt and image are same if image is a list or tensor or numpy array
-        if isinstance(image, list) or isinstance(image, torch.Tensor) or isinstance(image, np.ndarray):
+        if isinstance(image, (list, np.ndarray, torch.Tensor)):
            if prompt is not None and isinstance(prompt, str):
                batch_size = 1
            elif prompt is not None and isinstance(prompt, list):
@@ -78,7 +78,7 @@ def torch_dfs(model: torch.nn.Module):
 class StableDiffusionReferencePipeline(
    DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, IPAdapterMixin, FromSingleFileMixin
 ):
-    r""" "
+    r"""
    Pipeline for Stable Diffusion Reference.

    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
@@ -1358,7 +1358,7 @@ def main(args):
                # estimates to predict the data point in the augmented PF-ODE trajectory corresponding to the next ODE
                # solver timestep.
                with torch.no_grad():
-                    if torch.backends.mps.is_available() or "playground" in args.pretrained_model_name_or_path:
+                    if torch.backends.mps.is_available() or "playground" in args.pretrained_teacher_model:
                        autocast_ctx = nullcontext()
                    else:
                        autocast_ctx = torch.autocast(accelerator.device.type)
@@ -152,7 +152,7 @@ def collate_fn(examples, with_prior_preservation):


 class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
@@ -742,7 +742,7 @@ def collate_fn(examples, with_prior_preservation=False):


 class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
@@ -759,7 +759,7 @@ class PromptDataset(Dataset):


 def model_has_vae(args):
-    config_file_name = os.path.join("vae", AutoencoderKL.config_name)
+    config_file_name = Path("vae", AutoencoderKL.config_name).as_posix()
    if os.path.isdir(args.pretrained_model_name_or_path):
        config_file_name = os.path.join(args.pretrained_model_name_or_path, config_file_name)
        return os.path.isfile(config_file_name)
@@ -301,7 +301,7 @@ class DreamBoothDataset(Dataset):


 class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
@@ -680,7 +680,7 @@ def collate_fn(examples, with_prior_preservation=False):


 class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
@@ -903,7 +903,7 @@ def collate_fn(examples, with_prior_preservation=False):


 class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
@@ -327,7 +327,7 @@ class DreamBoothDataset(Dataset):


 class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
@@ -385,7 +385,7 @@ class DreamBoothDataset(Dataset):


 class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
@@ -384,7 +384,7 @@ class DreamBoothDataset(Dataset):


 class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
@@ -1,6 +1,6 @@
 diffusers==0.20.1
 accelerate==0.23.0
-transformers==4.36.0
+transformers==4.38.0
 peft==0.5.0
 torch==2.0.1
 torchvision>=0.16
@@ -762,7 +762,7 @@ def collate_fn(examples, with_prior_preservation=False):


 class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
@@ -700,7 +700,7 @@ def collate_fn(examples, with_prior_preservation=False):


 class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
@@ -922,7 +922,7 @@ def collate_fn(examples, with_prior_preservation=False):


 class PromptDataset(Dataset):
-    "A simple dataset to prepare the prompts to generate class images on multiple GPUs."
+    """A simple dataset to prepare the prompts to generate class images on multiple GPUs."""

    def __init__(self, prompt, num_samples):
        self.prompt = prompt
@@ -1,8 +1,8 @@
 accelerate>=0.16.0
 torchvision
 transformers>=4.25.1
-datasets
+datasets>=2.19.1
 ftfy
 tensorboard
 Jinja2
-peft==0.7.0
+peft==0.7.0
@@ -50,7 +50,7 @@ from diffusers.optimization import get_scheduler
 from diffusers.training_utils import EMAModel, compute_snr
 from diffusers.utils import check_min_version, is_wandb_available
 from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
-from diffusers.utils.import_utils import is_xformers_available
+from diffusers.utils.import_utils import is_torch_npu_available, is_xformers_available
 from diffusers.utils.torch_utils import is_compiled_module


@@ -58,7 +58,8 @@ from diffusers.utils.torch_utils import is_compiled_module
 check_min_version("0.28.0.dev0")

 logger = get_logger(__name__)
-
+if is_torch_npu_available():
+    torch.npu.config.allow_internal_format = False

 DATASET_NAME_MAPPING = {
    "lambdalabs/naruto-blip-captions": ("image", "text"),
@@ -460,6 +461,9 @@ def parse_args(input_args=None):
        ),
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
+    parser.add_argument(
+        "--enable_npu_flash_attention", action="store_true", help="Whether or not to use npu flash attention."
+    )
    parser.add_argument(
        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
    )
@@ -716,7 +720,12 @@ def main(args):
            args.pretrained_model_name_or_path, subfolder="unet", revision=args.revision, variant=args.variant
        )
        ema_unet = EMAModel(ema_unet.parameters(), model_cls=UNet2DConditionModel, model_config=ema_unet.config)
-
+    if args.enable_npu_flash_attention:
+        if is_torch_npu_available():
+            logger.info("npu flash attention enabled.")
+            unet.enable_npu_flash_attention()
+        else:
+            raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu devices.")
    if args.enable_xformers_memory_efficient_attention:
        if is_xformers_available():
            import xformers
@@ -0,0 +1,127 @@
+## Training a VQGAN VAE
+VQVAEs were first introduced in [Neural Discrete Representation Learning](https://arxiv.org/abs/1711.00937) and were combined with a GAN in the paper [Taming Transformers for High-Resolution Image Synthesis](https://arxiv.org/abs/2012.09841). The basic idea of a VQVAE is that it is a type of variational autoencoder whose latent space consists of discrete tokens, similar to the tokens used by LLMs. This script was adapted from a [PR to Hugging Face's open-muse project](https://github.com/huggingface/open-muse/pull/52), with the general code following [lucidrains' implementation of the VQGAN training script](https://github.com/lucidrains/muse-maskgit-pytorch/blob/main/muse_maskgit_pytorch/trainers.py); both of these implementations in turn follow the [taming-transformers repo](https://github.com/CompVis/taming-transformers?tab=readme-ov-file).
+
+
+Creating a training image set is [described in a different document](https://huggingface.co/docs/datasets/image_process#image-datasets).
+
+### Installing the dependencies
+
+Before running the scripts, make sure to install the library's training dependencies:
+
+**Important**
+
+To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
+```bash
+git clone https://github.com/huggingface/diffusers
+cd diffusers
+pip install .
+```
+
+Then cd into the example folder and run
+```bash
+pip install -r requirements.txt
+```
+
+
+And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
+
+```bash
+accelerate config
+```
+
+### Training on CIFAR10
+
+The command to train a VQGAN model on the CIFAR10 dataset:
+
+```bash
+accelerate launch train_vqgan.py \
+  --dataset_name=cifar10 \
+  --image_column=img \
+  --validation_images images/bird.jpg images/car.jpg images/dog.jpg images/frog.jpg \
+  --resolution=128 \
+  --train_batch_size=2 \
+  --gradient_accumulation_steps=8 \
+  --report_to=wandb
+```
+
+An example training run is [here](https://wandb.ai/sayakpaul/vqgan-training/runs/0m5kzdfp) by @sayakpaul and a lower scale one [here](https://wandb.ai/dsbuddy27/vqgan-training/runs/eqd6xi4n?nw=nwuserisamu). The validation images can be obtained from [here](https://huggingface.co/datasets/diffusers/docs-images/tree/main/vqgan_validation_images).
+The simplest way to improve the quality of a VQGAN model is to maximize the amount of information present in the bottleneck. The easiest way to do this is to increase the image resolution. Other options include, but are not limited to, lowering the compression by downsampling fewer times or increasing the vocabulary size, which should be at most around 16384. How to do this is shown below.
+
+### Modifying the architecture
+
+To modify the architecture of the VQGAN model, you can save the config taken from [here](https://huggingface.co/kandinsky-community/kandinsky-2-2-decoder/blob/main/movq/config.json) and then provide it to the script with the `--model_config_name_or_path` option. The config is shown below:
+```
+{
+  "_class_name": "VQModel",
+  "_diffusers_version": "0.17.0.dev0",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    256,
+    512
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "AttnDownEncoderBlock2D"
+  ],
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "norm_type": "spatial",
+  "num_vq_embeddings": 16384,
+  "out_channels": 3,
+  "sample_size": 32,
+  "scaling_factor": 0.18215,
+  "up_block_types": [
+    "AttnUpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ],
+  "vq_embed_dim": 4
+}
+```
+To reduce the number of layers in a VQGAN, you can remove layers by modifying `block_out_channels`, `down_block_types`, and `up_block_types` as shown below:
+```
+{
+  "_class_name": "VQModel",
+  "_diffusers_version": "0.17.0.dev0",
+  "act_fn": "silu",
+  "block_out_channels": [
+    128,
+    256,
+    256
+  ],
+  "down_block_types": [
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D",
+    "DownEncoderBlock2D"
+  ],
+  "in_channels": 3,
+  "latent_channels": 4,
+  "layers_per_block": 2,
+  "norm_num_groups": 32,
+  "norm_type": "spatial",
+  "num_vq_embeddings": 16384,
+  "out_channels": 3,
+  "sample_size": 32,
+  "scaling_factor": 0.18215,
+  "up_block_types": [
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D",
+    "UpDecoderBlock2D"
+  ],
+  "vq_embed_dim": 4
+}
+```
+To increase the size of the vocabulary, you can increase `num_vq_embeddings`. However, [some research](https://magvit.cs.cmu.edu/v2/) shows that the representation quality of VQGANs starts degrading beyond 2^14 = 16384 VQ embeddings, so it's not recommended to go past that.
+
+## Extra training tips/ideas
+During logging, take care to make sure `data_time` is low. `data_time` is the amount of time spent loading the data, during which the GPU is not active; essentially, it's wasted time. The easiest way to lower `data_time` is to increase `--dataloader_num_workers` to a higher number like 4. Due to a bug in PyTorch, this only works on Linux-based systems. For more details check [here](https://github.com/huggingface/diffusers/issues/7646).
+Secondly, training can be considered done when both the discriminator and the generator losses converge.
+Thirdly, another low-hanging fruit is using EMA via the `--use_ema` parameter. This tends to make the output images smoother. The downside is that you may have to lower your batch size by 1, but it may be worth it.
+Another, more experimental, low-hanging fruit is switching from vgg19 to a different model for the LPIPS loss using `--timm_model_backend`. If you do this, I recommend also changing the `timm_model_layers` parameter to the layer in your model which you think is best for representation. However, be careful with the feature map norms, since they can easily dominate the loss.
@@ -0,0 +1,48 @@
+"""
+Ported from Paella
+"""
+
+import torch
+from torch import nn
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.modeling_utils import ModelMixin
+
+
+# Discriminator model ported from Paella https://github.com/dome272/Paella/blob/main/src_distributed/vqgan.py
+class Discriminator(ModelMixin, ConfigMixin):
+    """
+    Convolutional discriminator built from spectral-normalized, stride-2 3x3 convolutions.
+
+    The encoder is a stack of `depth` strided convolutions, each followed by `LeakyReLU(0.2)` (and, for all but the
+    first convolution, preceded by `InstanceNorm2d`). A final 1x1 convolution (`shuffle`) maps the features to a
+    single channel, which is passed through a sigmoid.
+
+    Args:
+        in_channels: Number of channels of the input image.
+        cond_channels: Number of channels of the optional conditioning tensor concatenated before the 1x1
+            convolution; `0` means the 1x1 convolution only sees the encoder features.
+        hidden_channels: Channel width reached by the last encoder convolutions.
+        depth: Total number of stride-2 convolutions in the encoder.
+    """
+
+    @register_to_config
+    def __init__(self, in_channels=3, cond_channels=0, hidden_channels=512, depth=6):
+        super().__init__()
+        # Exponent of the initial channel reduction: the first conv outputs hidden_channels // 2**d channels.
+        d = max(depth - 3, 3)
+        layers = [
+            nn.utils.spectral_norm(
+                nn.Conv2d(in_channels, hidden_channels // (2**d), kernel_size=3, stride=2, padding=1)
+            ),
+            nn.LeakyReLU(0.2),
+        ]
+        # Each further block doubles the channel count until it reaches hidden_channels
+        # (the exponent is clamped at 0, so later blocks stay at hidden_channels).
+        for i in range(depth - 1):
+            c_in = hidden_channels // (2 ** max((d - i), 0))
+            c_out = hidden_channels // (2 ** max((d - 1 - i), 0))
+            layers.append(nn.utils.spectral_norm(nn.Conv2d(c_in, c_out, kernel_size=3, stride=2, padding=1)))
+            layers.append(nn.InstanceNorm2d(c_out))
+            layers.append(nn.LeakyReLU(0.2))
+        self.encoder = nn.Sequential(*layers)
+        # 1x1 conv producing a single-channel map; its input width includes cond_channels when conditioning is used.
+        self.shuffle = nn.Conv2d(
+            (hidden_channels + cond_channels) if cond_channels > 0 else hidden_channels, 1, kernel_size=1
+        )
+        # NOTE: despite the attribute name, this is a sigmoid, so `forward` returns values in (0, 1), not raw logits.
+        self.logits = nn.Sigmoid()
+
+    def forward(self, x, cond=None):
+        """
+        Run the discriminator.
+
+        Args:
+            x: Input tensor fed to the convolutional encoder.
+            cond: Optional conditioning tensor. It is viewed as `(cond.size(0), cond.size(1), 1, 1)`, broadcast to
+                the spatial size of the encoded features and concatenated along the channel dimension.
+                Presumably `cond.size(1)` must equal `cond_channels` for `shuffle` to accept the result (TODO
+                confirm against callers).
+
+        Returns:
+            The sigmoid of the single-channel output of the 1x1 convolution.
+        """
+        x = self.encoder(x)
+        if cond is not None:
+            cond = cond.view(
+                cond.size(0),
+                cond.size(1),
+                1,
+                1,
+            ).expand(-1, -1, x.size(-2), x.size(-1))
+            x = torch.cat([x, cond], dim=1)
+        x = self.shuffle(x)
+        x = self.logits(x)
+        return x
@@ -0,0 +1,8 @@
+accelerate>=0.16.0
+torchvision
+transformers>=4.25.1
+datasets
+timm
+numpy
+tqdm
+tensorboard
@@ -0,0 +1,395 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import json
+import logging
+import os
+import shutil
+import sys
+import tempfile
+
+import torch
+
+from diffusers import VQModel
+from diffusers.utils.testing_utils import require_timm
+
+
+sys.path.append("..")
+from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
+
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+stream_handler = logging.StreamHandler(sys.stdout)
+logger.addHandler(stream_handler)
+
+
+@require_timm
+class TextToImage(ExamplesTestsAccelerate):
+    """
+    Smoke tests for `examples/vqgan/train_vqgan.py`.
+
+    Every test writes tiny VQModel / Discriminator configs to a temporary directory, launches the training script
+    through `run_command` with `self._launch_args`, and then inspects the files written to `--output_dir`.
+    NOTE(review): the class name looks carried over from the text-to-image example tests; the tests here only
+    exercise the VQGAN training script.
+    """
+
+    @property
+    def test_vqmodel_config(self):
+        """Config dict for a minimal VQModel (one 32-channel block, 32 VQ embeddings) used to keep the tests fast."""
+        return {
+            "_class_name": "VQModel",
+            "_diffusers_version": "0.17.0.dev0",
+            "act_fn": "silu",
+            "block_out_channels": [
+                32,
+            ],
+            "down_block_types": [
+                "DownEncoderBlock2D",
+            ],
+            "in_channels": 3,
+            "latent_channels": 4,
+            "layers_per_block": 2,
+            "norm_num_groups": 32,
+            "norm_type": "spatial",
+            "num_vq_embeddings": 32,
+            "out_channels": 3,
+            "sample_size": 32,
+            "scaling_factor": 0.18215,
+            "up_block_types": [
+                "UpDecoderBlock2D",
+            ],
+            "vq_embed_dim": 4,
+        }
+
+    @property
+    def test_discriminator_config(self):
+        """Config dict for a minimal Discriminator (8 hidden channels, depth 4) used to keep the tests fast."""
+        return {
+            "_class_name": "Discriminator",
+            "_diffusers_version": "0.27.0.dev0",
+            "in_channels": 3,
+            "cond_channels": 0,
+            "hidden_channels": 8,
+            "depth": 4,
+        }
+
+    def get_vq_and_discriminator_configs(self, tmpdir):
+        """
+        Dump both test configs as JSON files into `tmpdir`.
+
+        Returns:
+            A tuple `(vqmodel_config_path, discriminator_config_path)` with the paths of the written files.
+        """
+        vqmodel_config_path = os.path.join(tmpdir, "vqmodel.json")
+        discriminator_config_path = os.path.join(tmpdir, "discriminator.json")
+        with open(vqmodel_config_path, "w") as fp:
+            json.dump(self.test_vqmodel_config, fp)
+        with open(discriminator_config_path, "w") as fp:
+            json.dump(self.test_discriminator_config, fp)
+        return vqmodel_config_path, discriminator_config_path
+
+    def test_vqmodel(self):
+        """Train for 2 steps and check that both the discriminator and the VQ model weights were saved."""
+        with tempfile.TemporaryDirectory() as tmpdir:
+            vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
+            test_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 2
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --output_dir {tmpdir}
+                """.split()
+
+            run_command(self._launch_args + test_args)
+            # save_pretrained smoke test
+            self.assertTrue(
+                os.path.isfile(os.path.join(tmpdir, "discriminator", "diffusion_pytorch_model.safetensors"))
+            )
+            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "vqmodel", "diffusion_pytorch_model.safetensors")))
+
+    def test_vqmodel_checkpointing(self):
+        """
+        Check checkpoint creation and resuming.
+
+        Trains for 4 steps with a checkpoint every 2 steps, loads and runs the intermediate checkpoint, deletes
+        `checkpoint-2`, resumes from `checkpoint-4` up to step 6 and verifies which checkpoint directories exist.
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
+            # Run training script with checkpointing
+            # max_train_steps == 4, checkpointing_steps == 2
+            # Should create checkpoints at steps 2, 4
+
+            initial_run_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 4
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --checkpointing_steps=2
+                --output_dir {tmpdir}
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            # check can run an intermediate checkpoint
+            model = VQModel.from_pretrained(tmpdir, subfolder="checkpoint-2/vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
+            shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4"},
+            )
+
+            # Run training script for 2 total steps resuming from checkpoint 4
+
+            resume_run_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --checkpointing_steps=1
+                --resume_from_checkpoint={os.path.join(tmpdir, 'checkpoint-4')}
+                --output_dir {tmpdir}
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            # check can run new fully trained pipeline
+            model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # no checkpoint-2 -> check old checkpoints do not exist
+            # check new checkpoints exist
+            # In the current script, checkpointing_steps 1 is equivalent to checkpointing_steps 2 as after the generator gets trained for one step,
+            # the discriminator gets trained and loss and saving happens after that. Thus we do not expect to get a checkpoint-5
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_vqmodel_checkpointing_use_ema(self):
+        """
+        Same checkpoint/resume scenario as `test_vqmodel_checkpointing`, but with `--use_ema` passed to both runs.
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
+            # Run training script with checkpointing
+            # max_train_steps == 4, checkpointing_steps == 2
+            # Should create checkpoints at steps 2, 4
+
+            initial_run_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 4
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --checkpointing_steps=2
+                --output_dir {tmpdir}
+                --use_ema
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            # check can run an intermediate checkpoint
+            model = VQModel.from_pretrained(tmpdir, subfolder="checkpoint-2/vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # Remove checkpoint 2 so that we can check only later checkpoints exist after resuming
+            shutil.rmtree(os.path.join(tmpdir, "checkpoint-2"))
+
+            # Run training script for 2 total steps resuming from checkpoint 4
+
+            resume_run_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --checkpointing_steps=1
+                --resume_from_checkpoint={os.path.join(tmpdir, 'checkpoint-4')}
+                --output_dir {tmpdir}
+                --use_ema
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            # check can run new fully trained pipeline
+            model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # no checkpoint-2 -> check old checkpoints do not exist
+            # check new checkpoints exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-4", "checkpoint-6"},
+            )
+
+    def test_vqmodel_checkpointing_checkpoints_total_limit(self):
+        """
+        Train for 6 steps with a checkpoint every 2 steps and `--checkpoints_total_limit=2`, then check that only
+        `checkpoint-4` and `checkpoint-6` remain.
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
+            # Run training script with checkpointing
+            # max_train_steps == 6, checkpointing_steps == 2, checkpoints_total_limit == 2
+            # Should create checkpoints at steps 2, 4, 6
+            # with checkpoint at step 2 deleted
+
+            initial_run_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 6
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --checkpoints_total_limit=2
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # check checkpoint directories exist
+            # checkpoint-2 should have been deleted
+            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-4", "checkpoint-6"})
+
+    def test_vqmodel_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
+        """
+        Train for 4 steps without a checkpoint limit, then resume from `checkpoint-4` up to step 8 with
+        `--checkpoints_total_limit=2` and check that only `checkpoint-6` and `checkpoint-8` remain.
+        """
+        with tempfile.TemporaryDirectory() as tmpdir:
+            vqmodel_config_path, discriminator_config_path = self.get_vq_and_discriminator_configs(tmpdir)
+            # Run training script with checkpointing
+            # max_train_steps == 4, checkpointing_steps == 2
+            # Should create checkpoints at steps 2, 4
+
+            initial_run_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 4
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --checkpointing_steps=2
+                --output_dir {tmpdir}
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + initial_run_args)
+
+            model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-2", "checkpoint-4"},
+            )
+
+            # resume and we should try to checkpoint at 6, where we'll have to remove
+            # checkpoint-2 and checkpoint-4 instead of just a single previous checkpoint
+
+            resume_run_args = f"""
+                examples/vqgan/train_vqgan.py
+                --dataset_name hf-internal-testing/dummy_image_text_data
+                --resolution 32
+                --image_column image
+                --train_batch_size 1
+                --gradient_accumulation_steps 1
+                --max_train_steps 8
+                --learning_rate 5.0e-04
+                --scale_lr
+                --lr_scheduler constant
+                --lr_warmup_steps 0
+                --model_config_name_or_path {vqmodel_config_path}
+                --discriminator_config_name_or_path {discriminator_config_path}
+                --output_dir {tmpdir}
+                --checkpointing_steps=2
+                --resume_from_checkpoint={os.path.join(tmpdir, 'checkpoint-4')}
+                --checkpoints_total_limit=2
+                --seed=0
+                """.split()
+
+            run_command(self._launch_args + resume_run_args)
+
+            model = VQModel.from_pretrained(tmpdir, subfolder="vqmodel")
+            image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+            _ = model(image)
+
+            # check checkpoint directories exist
+            self.assertEqual(
+                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
+                {"checkpoint-6", "checkpoint-8"},
+            )
@@ -0,0 +1,156 @@
+from typing import Any, Dict, List
+
+from .configuration_utils import ConfigMixin, register_to_config
+from .utils import CONFIG_NAME
+
+
+class PipelineCallback(ConfigMixin):
+    """
+    Base class for all the official callbacks used in a pipeline. It fixes the interface every callback exposes and
+    validates the cutoff configuration the callbacks are constructed with.
+
+    Subclasses are expected to provide:
+        `tensor_inputs`: the list of tensor inputs specific to the callback. Only variables listed in the
+            `._callback_tensor_inputs` attribute of your pipeline class can be included.
+        `callback_fn`: the method implementing the core functionality of the callback.
+
+    Exactly one of `cutoff_step_ratio` and `cutoff_step_index` must be set. Because `cutoff_step_ratio` defaults to
+    `1.0`, selecting a cutoff by index requires passing `cutoff_step_ratio=None` explicitly.
+    """
+
+    config_name = CONFIG_NAME
+
+    @register_to_config
+    def __init__(self, cutoff_step_ratio=1.0, cutoff_step_index=None):
+        super().__init__()
+
+        # The two settings are mutually exclusive: "both missing" and "both given" are rejected alike.
+        ratio_given = cutoff_step_ratio is not None
+        index_given = cutoff_step_index is not None
+        if ratio_given == index_given:
+            raise ValueError("Either cutoff_step_ratio or cutoff_step_index should be provided, not both or none.")
+
+        if ratio_given:
+            # Only genuine floats inside [0.0, 1.0] are accepted (an `int` such as `1` is rejected).
+            ratio_is_valid = isinstance(cutoff_step_ratio, float) and 0.0 <= cutoff_step_ratio <= 1.0
+            if not ratio_is_valid:
+                raise ValueError("cutoff_step_ratio must be a float between 0.0 and 1.0.")
+
+    @property
+    def tensor_inputs(self) -> List[str]:
+        """Names of the tensors the callback operates on; must be overridden by subclasses."""
+        raise NotImplementedError(f"You need to set the attribute `tensor_inputs` for {self.__class__}")
+
+    def callback_fn(self, pipeline, step_index, timesteps, callback_kwargs) -> Dict[str, Any]:
+        """Core logic of the callback; must be overridden by subclasses."""
+        raise NotImplementedError(f"You need to implement the method `callback_fn` for {self.__class__}")
+
+    def __call__(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
+        """Forwards the call to `callback_fn` and returns whatever it returns."""
+        updated_kwargs = self.callback_fn(pipeline, step_index, timestep, callback_kwargs)
+        return updated_kwargs
+
+
+class MultiPipelineCallbacks:
+    """
+    Bundles several pipeline callbacks behind the interface of a single one. It is built from a list of
+    PipelineCallback objects and invokes each of them in turn.
+    """
+
+    def __init__(self, callbacks: List[PipelineCallback]):
+        self.callbacks = callbacks
+
+    @property
+    def tensor_inputs(self) -> List[str]:
+        # Concatenate the inputs of every callback in list order; duplicates are kept as-is.
+        combined = []
+        for callback in self.callbacks:
+            combined.extend(callback.tensor_inputs)
+        return combined
+
+    def __call__(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
+        """
+        Runs the callbacks in order, feeding each one the `callback_kwargs` returned by the previous one, and returns
+        the `callback_kwargs` produced by the last callback.
+        """
+        current_kwargs = callback_kwargs
+        for callback in self.callbacks:
+            current_kwargs = callback(pipeline, step_index, timestep, current_kwargs)
+
+        return current_kwargs
+
+
+class SDCFGCutoffCallback(PipelineCallback):
+    """
+    Callback function for Stable Diffusion Pipelines. Once the cutoff step (set by `cutoff_step_ratio` or
+    `cutoff_step_index`) is reached, this callback disables the CFG.
+
+    Note: This callback mutates the pipeline by changing the `_guidance_scale` attribute to 0.0 at the cutoff step.
+    """
+
+    tensor_inputs = ["prompt_embeds"]
+
+    def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
+        ratio = self.config.cutoff_step_ratio
+        cutoff_step = self.config.cutoff_step_index
+
+        # An explicit index takes precedence; otherwise the step is derived from the ratio.
+        if cutoff_step is None:
+            cutoff_step = int(pipeline.num_timesteps * ratio)
+
+        # Nothing to do on any step other than the cutoff step itself.
+        if step_index != cutoff_step:
+            return callback_kwargs
+
+        embeds_key = self.tensor_inputs[0]
+        # Keep only the last entry: "-1" denotes the embeddings for conditional text tokens.
+        conditional_embeds = callback_kwargs[embeds_key][-1:]
+
+        pipeline._guidance_scale = 0.0
+
+        callback_kwargs[embeds_key] = conditional_embeds
+        return callback_kwargs
+
+
+class SDXLCFGCutoffCallback(PipelineCallback):
+    """
+    Callback function for Stable Diffusion XL Pipelines. Once the cutoff step (set by `cutoff_step_ratio` or
+    `cutoff_step_index`) is reached, this callback disables the CFG.
+
+    Note: This callback mutates the pipeline by changing the `_guidance_scale` attribute to 0.0 at the cutoff step.
+    """
+
+    tensor_inputs = ["prompt_embeds", "add_text_embeds", "add_time_ids"]
+
+    def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
+        ratio = self.config.cutoff_step_ratio
+        cutoff_step = self.config.cutoff_step_index
+
+        # An explicit index takes precedence; otherwise the step is derived from the ratio.
+        if cutoff_step is None:
+            cutoff_step = int(pipeline.num_timesteps * ratio)
+
+        # Nothing to do on any step other than the cutoff step itself.
+        if step_index != cutoff_step:
+            return callback_kwargs
+
+        # Prompt embeddings, pooled text embeddings and added time vector, in that order.
+        keys = (self.tensor_inputs[0], self.tensor_inputs[1], self.tensor_inputs[2])
+
+        # Keep only the last entry of each tensor: "-1" denotes the conditional part of the batch.
+        conditional_only = {key: callback_kwargs[key][-1:] for key in keys}
+
+        pipeline._guidance_scale = 0.0
+
+        for key in keys:
+            callback_kwargs[key] = conditional_only[key]
+        return callback_kwargs
+
+
+class IPAdapterScaleCutoffCallback(PipelineCallback):
+    """
+    Callback function for any pipeline that inherits `IPAdapterMixin`. Once the cutoff step (set by
+    `cutoff_step_ratio` or `cutoff_step_index`) is reached, this callback sets the IP Adapter scale to `0.0`.
+
+    Note: This callback mutates the IP Adapter attention processors by setting the scale to 0.0 at the cutoff step.
+    """
+
+    tensor_inputs = []
+
+    def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
+        ratio = self.config.cutoff_step_ratio
+        cutoff_step = self.config.cutoff_step_index
+
+        # An explicit index takes precedence; otherwise the step is derived from the ratio.
+        if cutoff_step is None:
+            cutoff_step = int(pipeline.num_timesteps * ratio)
+
+        reached_cutoff = step_index == cutoff_step
+        if reached_cutoff:
+            pipeline.set_ip_adapter_scale(0.0)
+        return callback_kwargs
@@ -13,12 +13,25 @@
 # limitations under the License.

 import platform
+import subprocess
 from argparse import ArgumentParser

 import huggingface_hub

 from .. import __version__ as version
-from ..utils import is_accelerate_available, is_torch_available, is_transformers_available, is_xformers_available
+from ..utils import (
+    is_accelerate_available,
+    is_bitsandbytes_available,
+    is_flax_available,
+    is_google_colab,
+    is_notebook,
+    is_peft_available,
+    is_safetensors_available,
+    is_torch_available,
+    is_transformers_available,
+    is_xformers_available,
+)
+from ..utils.testing_utils import get_python_version
 from . import BaseDiffusersCLICommand


@@ -28,13 +41,19 @@ def info_command_factory(_):

 class EnvironmentCommand(BaseDiffusersCLICommand):
    @staticmethod
-    def register_subcommand(parser: ArgumentParser):
+    def register_subcommand(parser: ArgumentParser) -> None:
        download_parser = parser.add_parser("env")
        download_parser.set_defaults(func=info_command_factory)

-    def run(self):
+    def run(self) -> dict:
        hub_version = huggingface_hub.__version__

+        safetensors_version = "not installed"
+        if is_safetensors_available():
+            import safetensors
+
+            safetensors_version = safetensors.__version__
+
        pt_version = "not installed"
        pt_cuda_available = "NA"
        if is_torch_available():
@@ -43,6 +62,20 @@ class EnvironmentCommand(BaseDiffusersCLICommand):
            pt_version = torch.__version__
            pt_cuda_available = torch.cuda.is_available()

+        flax_version = "not installed"
+        jax_version = "not installed"
+        jaxlib_version = "not installed"
+        jax_backend = "NA"
+        if is_flax_available():
+            import flax
+            import jax
+            import jaxlib
+
+            flax_version = flax.__version__
+            jax_version = jax.__version__
+            jaxlib_version = jaxlib.__version__
+            jax_backend = jax.lib.xla_bridge.get_backend().platform
+
        transformers_version = "not installed"
        if is_transformers_available():
            import transformers
@@ -55,21 +88,92 @@ class EnvironmentCommand(BaseDiffusersCLICommand):

            accelerate_version = accelerate.__version__

+        peft_version = "not installed"
+        if is_peft_available():
+            import peft
+
+            peft_version = peft.__version__
+
+        bitsandbytes_version = "not installed"
+        if is_bitsandbytes_available():
+            import bitsandbytes
+
+            bitsandbytes_version = bitsandbytes.__version__
+
        xformers_version = "not installed"
        if is_xformers_available():
            import xformers

            xformers_version = xformers.__version__

+        if get_python_version() >= (3, 10):
+            platform_info = f"{platform.freedesktop_os_release().get('PRETTY_NAME', None)} - {platform.platform()}"
+        else:
+            platform_info = platform.platform()
+
+        is_notebook_str = "Yes" if is_notebook() else "No"
+
+        is_google_colab_str = "Yes" if is_google_colab() else "No"
+
+        accelerator = "NA"
+        if platform.system() in {"Linux", "Windows"}:
+            try:
+                sp = subprocess.Popen(
+                    ["nvidia-smi", "--query-gpu=gpu_name,memory.total", "--format=csv,noheader"],
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                )
+                out_str, _ = sp.communicate()
+                out_str = out_str.decode("utf-8")
+
+                if len(out_str) > 0:
+                    accelerator = out_str.strip() + " VRAM"
+            except FileNotFoundError:
+                pass
+        elif platform.system() == "Darwin":  # Mac OS
+            try:
+                sp = subprocess.Popen(
+                    ["system_profiler", "SPDisplaysDataType"],
+                    stdout=subprocess.PIPE,
+                    stderr=subprocess.PIPE,
+                )
+                out_str, _ = sp.communicate()
+                out_str = out_str.decode("utf-8")
+
+                start = out_str.find("Chipset Model:")
+                if start != -1:
+                    start += len("Chipset Model:")
+                    end = out_str.find("\n", start)
+                    accelerator = out_str[start:end].strip()
+
+                    start = out_str.find("VRAM (Total):")
+                    if start != -1:
+                        start += len("VRAM (Total):")
+                        end = out_str.find("\n", start)
+                        accelerator += " VRAM: " + out_str[start:end].strip()
+            except FileNotFoundError:
+                pass
+        else:
+            print("It seems you are running an unusual OS. Could you fill in the accelerator manually?")
+
        info = {
-            "`diffusers` version": version,
-            "Platform": platform.platform(),
+            "🤗 Diffusers version": version,
+            "Platform": platform_info,
+            "Running on a notebook?": is_notebook_str,
+            "Running on Google Colab?": is_google_colab_str,
            "Python version": platform.python_version(),
            "PyTorch version (GPU?)": f"{pt_version} ({pt_cuda_available})",
+            "Flax version (CPU?/GPU?/TPU?)": f"{flax_version} ({jax_backend})",
+            "Jax version": jax_version,
+            "JaxLib version": jaxlib_version,
            "Huggingface_hub version": hub_version,
            "Transformers version": transformers_version,
            "Accelerate version": accelerate_version,
+            "PEFT version": peft_version,
+            "Bitsandbytes version": bitsandbytes_version,
+            "Safetensors version": safetensors_version,
            "xFormers version": xformers_version,
+            "Accelerator": accelerator,
            "Using GPU in script?": "<fill in>",
            "Using distributed or parallel set-up in script?": "<fill in>",
        }
@@ -80,5 +184,5 @@ class EnvironmentCommand(BaseDiffusersCLICommand):
        return info

    @staticmethod
-    def format_dict(d):
+    def format_dict(d: dict) -> str:
        return "\n".join([f"- {prop}: {val}" for prop, val in d.items()]) + "\n"
@@ -363,7 +363,7 @@ class LoraLoaderMixin:
        is_model_cpu_offload = False
        is_sequential_cpu_offload = False

-        if _pipeline is not None:
+        if _pipeline is not None and _pipeline.hf_device_map is None:
            for _, component in _pipeline.components.items():
                if isinstance(component, nn.Module) and hasattr(component, "_hf_hook"):
                    if not is_model_cpu_offload:
@@ -826,8 +826,8 @@ def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False, **kwargs)

    # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
    if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
-        logger.warninging("Checkpoint has both EMA and non-EMA weights.")
-        logger.warninging(
+        logger.warning("Checkpoint has both EMA and non-EMA weights.")
+        logger.warning(
            "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
            " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
        )
@@ -837,7 +837,7 @@ def convert_ldm_unet_checkpoint(checkpoint, config, extract_ema=False, **kwargs)
                unet_state_dict[key.replace(unet_key, "")] = checkpoint.get(flat_ema_key)
    else:
        if sum(k.startswith("model_ema") for k in keys) > 100:
-            logger.warninging(
+            logger.warning(
                "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
                " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
            )
@@ -419,19 +419,20 @@ class TextualInversionLoaderMixin:
        # 7.1 Offload all hooks in case the pipeline was cpu offloaded before make sure, we offload and onload again
        is_model_cpu_offload = False
        is_sequential_cpu_offload = False
-        for _, component in self.components.items():
-            if isinstance(component, nn.Module):
-                if hasattr(component, "_hf_hook"):
-                    is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
-                    is_sequential_cpu_offload = (
-                        isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
-                        or hasattr(component._hf_hook, "hooks")
-                        and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
-                    )
-                    logger.info(
-                        "Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again."
-                    )
-                    remove_hook_from_module(component, recurse=is_sequential_cpu_offload)
+        if self.hf_device_map is None:
+            for _, component in self.components.items():
+                if isinstance(component, nn.Module):
+                    if hasattr(component, "_hf_hook"):
+                        is_model_cpu_offload = isinstance(getattr(component, "_hf_hook"), CpuOffload)
+                        is_sequential_cpu_offload = (
+                            isinstance(getattr(component, "_hf_hook"), AlignDevicesHook)
+                            or hasattr(component._hf_hook, "hooks")
+                            and isinstance(component._hf_hook.hooks[0], AlignDevicesHook)
+                        )
+                        logger.info(
+                            "Accelerate hooks detected. Since you have called `load_textual_inversion()`, the previous hooks will be first removed. Then the textual inversion parameters will be loaded and the hooks will be applied again."
+                        )
+                        remove_hook_from_module(component, recurse=is_sequential_cpu_offload)

        # 7.2 save expected device and dtype
        device = text_encoder.device
@@ -300,7 +300,7 @@ class AutoencoderKL(ModelMixin, ConfigMixin, FromOriginalModelMixin):
            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
            decoded = torch.cat(decoded_slices)
        else:
-            decoded = self._decode(z).sample
+            decoded = self._decode(z, return_dict=False)[0]

        if not return_dict:
            return (decoded,)
@@ -41,6 +41,7 @@ class DecoderOutput(BaseOutput):
    """

    sample: torch.Tensor
+    commit_loss: Optional[torch.FloatTensor] = None


 class Encoder(nn.Module):
@@ -0,0 +1,149 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+# Copyright (c) 2022, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import inspect
+import os
+from collections import OrderedDict
+from typing import List, Optional, Union
+
+import safetensors
+import torch
+
+from ..utils import (
+    SAFETENSORS_FILE_EXTENSION,
+    is_accelerate_available,
+    is_torch_version,
+    logging,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_accelerate_available():
+    from accelerate import infer_auto_device_map
+    from accelerate.utils import get_balanced_memory, get_max_memory, set_module_tensor_to_device
+
+
+# Adapted from `transformers` (see modeling_utils.py)
+def _determine_device_map(model: torch.nn.Module, device_map, max_memory, torch_dtype):
+    """
+    Resolves a string `device_map` strategy into a concrete device map via `accelerate`.
+
+    Any `device_map` that is not a string is returned unchanged. For a string strategy, the model's no-split module
+    classes are looked up, the memory budget is computed (`get_max_memory` for "sequential", `get_balanced_memory`
+    otherwise) and the result of `infer_auto_device_map` is returned.
+    """
+    # Only string strategies need resolving; everything else is passed straight through.
+    if not isinstance(device_map, str):
+        return device_map
+
+    strategy = device_map
+    placement_kwargs = {"no_split_module_classes": model._get_no_split_modules(strategy)}
+
+    if strategy == "sequential":
+        max_memory = get_max_memory(max_memory)
+    else:
+        max_memory = get_balanced_memory(
+            model,
+            dtype=torch_dtype,
+            low_zero=(strategy == "balanced_low_0"),
+            max_memory=max_memory,
+            **placement_kwargs,
+        )
+
+    placement_kwargs["max_memory"] = max_memory
+    return infer_auto_device_map(model, dtype=torch_dtype, **placement_kwargs)
+
+
+def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[str] = None):
+    """
+    Reads a checkpoint file, returning properly formatted errors if they arise.
+
+    Files whose extension equals `SAFETENSORS_FILE_EXTENSION` are read with `safetensors`; every other file is read
+    with `torch.load` (with `weights_only=True` on torch >= 1.13). Tensors are loaded onto the CPU.
+
+    Args:
+        checkpoint_file: Path of the checkpoint to read.
+        variant: Accepted for interface compatibility; not used by this function.
+
+    Raises:
+        OSError: If the file looks like a git-lfs pointer (its text starts with "version"), or if the weights cannot
+            be loaded for any other reason. In the latter case the original loading error is attached as the cause.
+    """
+    try:
+        file_extension = os.path.basename(checkpoint_file).split(".")[-1]
+        if file_extension == SAFETENSORS_FILE_EXTENSION:
+            # `import safetensors` alone does not load the `torch` submodule, so import the loader explicitly
+            # instead of relying on another module having imported `safetensors.torch` beforehand.
+            from safetensors.torch import load_file as load_safetensors_file
+
+            return load_safetensors_file(checkpoint_file, device="cpu")
+        else:
+            weights_only_kwarg = {"weights_only": True} if is_torch_version(">=", "1.13") else {}
+            return torch.load(
+                checkpoint_file,
+                map_location="cpu",
+                **weights_only_kwarg,
+            )
+    except Exception as e:
+        try:
+            with open(checkpoint_file) as f:
+                # Only the leading characters are needed to recognise a git-lfs pointer file; reading the whole
+                # (potentially multi-GB) checkpoint into memory just for this check is unnecessary.
+                if f.read(len("version")) == "version":
+                    raise OSError(
+                        "You seem to have cloned a repository without having git-lfs installed. Please install "
+                        "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
+                        "you cloned."
+                    )
+                else:
+                    raise ValueError(
+                        f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
+                        "model. Make sure you have saved the model properly."
+                    ) from e
+        except (UnicodeDecodeError, ValueError):
+            # Chain the original loading failure so the real reason is visible in the traceback.
+            raise OSError(
+                f"Unable to load weights from checkpoint file for '{checkpoint_file}' " f"at '{checkpoint_file}'. "
+            ) from e
+
+
+def load_model_dict_into_meta(
+    model,
+    state_dict: OrderedDict,
+    device: Optional[Union[str, torch.device]] = None,
+    dtype: Optional[Union[str, torch.dtype]] = None,
+    model_name_or_path: Optional[str] = None,
+) -> List[str]:
+    """
+    Copies the tensors of `state_dict` into `model` one parameter at a time via `set_module_tensor_to_device`.
+
+    Args:
+        model: The model to populate. NOTE(review): presumably instantiated with empty/meta weights - confirm
+            against the caller.
+        state_dict: Mapping from parameter name to the tensor to load.
+        device: Target device for every loaded tensor. Defaults to CPU.
+        dtype: Target dtype, forwarded only when the installed `set_module_tensor_to_device` accepts a `dtype`
+            argument. Defaults to `torch.float32`.
+        model_name_or_path: Optional identifier that is only used to enrich the shape-mismatch error message.
+
+    Returns:
+        The keys of `state_dict` that do not exist in the model's own state dict; these are skipped.
+
+    Raises:
+        ValueError: If a tensor's shape differs from the shape of the corresponding model parameter.
+    """
+    device = device or torch.device("cpu")
+    dtype = dtype or torch.float32
+
+    # Older `accelerate` releases may not expose a `dtype` parameter, so detect it once up front.
+    accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys())
+
+    unexpected_keys = []
+    empty_state_dict = model.state_dict()
+    for param_name, param in state_dict.items():
+        if param_name not in empty_state_dict:
+            unexpected_keys.append(param_name)
+            continue
+
+        if empty_state_dict[param_name].shape != param.shape:
+            model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else ""
+            # Report the expected *shape*; interpolating the tensor itself would dump its repr into the message.
+            raise ValueError(
+                f"Cannot load {model_name_or_path_str}because {param_name} expected shape {empty_state_dict[param_name].shape}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example."
+            )
+
+        if accepts_dtype:
+            set_module_tensor_to_device(model, param_name, device, value=param, dtype=dtype)
+        else:
+            set_module_tensor_to_device(model, param_name, device, value=param)
+    return unexpected_keys
+
+
+def _load_state_dict_into_model(model_to_load, state_dict: OrderedDict) -> List[str]:
+    """
+    Loads `state_dict` into `model_to_load` module by module and returns the collected error messages.
+    """
+    # Convert old format to new format if needed from a PyTorch state_dict
+    # Work on a copy so that `_load_from_state_dict` is free to modify it.
+    state_dict = state_dict.copy()
+    error_msgs = []
+
+    # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants, so every module of
+    # the tree is visited explicitly. The stack yields the same pre-order traversal as a recursive walk.
+    pending = [(model_to_load, "")]
+    while pending:
+        module, prefix = pending.pop()
+        module._load_from_state_dict(state_dict, prefix, {}, True, [], [], error_msgs)
+
+        children = [
+            (child, prefix + name + ".") for name, child in module._modules.items() if child is not None
+        ]
+        # Push in reverse so that the first child is popped (and therefore loaded) first.
+        pending.extend(reversed(children))
+
+    return error_msgs
@@ -33,7 +33,6 @@ from .. import __version__
 from ..utils import (
    CONFIG_NAME,
    FLAX_WEIGHTS_NAME,
-    SAFETENSORS_FILE_EXTENSION,
    SAFETENSORS_WEIGHTS_NAME,
    WEIGHTS_NAME,
    _add_variant,
@@ -44,6 +43,12 @@ from ..utils import (
    logging,
 )
 from ..utils.hub_utils import PushToHubMixin, load_or_create_model_card, populate_model_card
+from .model_loading_utils import (
+    _determine_device_map,
+    _load_state_dict_into_model,
+    load_model_dict_into_meta,
+    load_state_dict,
+)


 logger = logging.get_logger(__name__)
@@ -57,9 +62,6 @@ else:

 if is_accelerate_available():
    import accelerate
-    from accelerate import infer_auto_device_map
-    from accelerate.utils import get_balanced_memory, get_max_memory, set_module_tensor_to_device
-    from accelerate.utils.versions import is_torch_version


 def get_parameter_device(parameter: torch.nn.Module) -> torch.device:
@@ -100,117 +102,6 @@ def get_parameter_dtype(parameter: torch.nn.Module) -> torch.dtype:
        return first_tuple[1].dtype


-# Adapted from `transformers` (see modeling_utils.py)
-def _determine_device_map(model: "ModelMixin", device_map, max_memory, torch_dtype):
-    if isinstance(device_map, str):
-        no_split_modules = model._get_no_split_modules(device_map)
-        device_map_kwargs = {"no_split_module_classes": no_split_modules}
-
-        if device_map != "sequential":
-            max_memory = get_balanced_memory(
-                model,
-                dtype=torch_dtype,
-                low_zero=(device_map == "balanced_low_0"),
-                max_memory=max_memory,
-                **device_map_kwargs,
-            )
-        else:
-            max_memory = get_max_memory(max_memory)
-
-        device_map_kwargs["max_memory"] = max_memory
-        device_map = infer_auto_device_map(model, dtype=torch_dtype, **device_map_kwargs)
-
-    return device_map
-
-
-def load_state_dict(checkpoint_file: Union[str, os.PathLike], variant: Optional[str] = None):
-    """
-    Reads a checkpoint file, returning properly formatted errors if they arise.
-    """
-    try:
-        file_extension = os.path.basename(checkpoint_file).split(".")[-1]
-        if file_extension == SAFETENSORS_FILE_EXTENSION:
-            return safetensors.torch.load_file(checkpoint_file, device="cpu")
-        else:
-            weights_only_kwarg = {"weights_only": True} if is_torch_version(">=", "1.13") else {}
-            return torch.load(
-                checkpoint_file,
-                map_location="cpu",
-                **weights_only_kwarg,
-            )
-    except Exception as e:
-        try:
-            with open(checkpoint_file) as f:
-                if f.read().startswith("version"):
-                    raise OSError(
-                        "You seem to have cloned a repository without having git-lfs installed. Please install "
-                        "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
-                        "you cloned."
-                    )
-                else:
-                    raise ValueError(
-                        f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
-                        "model. Make sure you have saved the model properly."
-                    ) from e
-        except (UnicodeDecodeError, ValueError):
-            raise OSError(
-                f"Unable to load weights from checkpoint file for '{checkpoint_file}' " f"at '{checkpoint_file}'. "
-            )
-
-
-def load_model_dict_into_meta(
-    model,
-    state_dict: OrderedDict,
-    device: Optional[Union[str, torch.device]] = None,
-    dtype: Optional[Union[str, torch.dtype]] = None,
-    model_name_or_path: Optional[str] = None,
-) -> List[str]:
-    device = device or torch.device("cpu")
-    dtype = dtype or torch.float32
-
-    accepts_dtype = "dtype" in set(inspect.signature(set_module_tensor_to_device).parameters.keys())
-
-    unexpected_keys = []
-    empty_state_dict = model.state_dict()
-    for param_name, param in state_dict.items():
-        if param_name not in empty_state_dict:
-            unexpected_keys.append(param_name)
-            continue
-
-        if empty_state_dict[param_name].shape != param.shape:
-            model_name_or_path_str = f"{model_name_or_path} " if model_name_or_path is not None else ""
-            raise ValueError(
-                f"Cannot load {model_name_or_path_str}because {param_name} expected shape {empty_state_dict[param_name]}, but got {param.shape}. If you want to instead overwrite randomly initialized weights, please make sure to pass both `low_cpu_mem_usage=False` and `ignore_mismatched_sizes=True`. For more information, see also: https://github.com/huggingface/diffusers/issues/1619#issuecomment-1345604389 as an example."
-            )
-
-        if accepts_dtype:
-            set_module_tensor_to_device(model, param_name, device, value=param, dtype=dtype)
-        else:
-            set_module_tensor_to_device(model, param_name, device, value=param)
-    return unexpected_keys
-
-
-def _load_state_dict_into_model(model_to_load, state_dict: OrderedDict) -> List[str]:
-    # Convert old format to new format if needed from a PyTorch state_dict
-    # copy state_dict so _load_from_state_dict can modify it
-    state_dict = state_dict.copy()
-    error_msgs = []
-
-    # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
-    # so we need to apply the function recursively.
-    def load(module: torch.nn.Module, prefix: str = ""):
-        args = (state_dict, prefix, {}, True, [], [], error_msgs)
-        module._load_from_state_dict(*args)
-
-        for name, child in module._modules.items():
-            if child is not None:
-                load(child, prefix + name + ".")
-
-    load(model_to_load)
-
-    return error_msgs
-
-
 class ModelMixin(torch.nn.Module, PushToHubMixin):
    r"""
    Base class for all models.
@@ -685,7 +685,7 @@ class UNet2DConditionModel(
            positive_len = 768
            if isinstance(cross_attention_dim, int):
                positive_len = cross_attention_dim
-            elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
+            elif isinstance(cross_attention_dim, (list, tuple)):
                positive_len = cross_attention_dim[0]

            feature_type = "text-only" if attention_type == "gated" else "text-image"
@@ -15,6 +15,7 @@ from typing import Any, Dict, Optional, Tuple, Union

 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 import torch.utils.checkpoint

 from ...configuration_utils import ConfigMixin, FrozenDict, register_to_config
@@ -27,6 +28,9 @@ from ..attention_processor import (
    AttentionProcessor,
    AttnAddedKVProcessor,
    AttnProcessor,
+    AttnProcessor2_0,
+    IPAdapterAttnProcessor,
+    IPAdapterAttnProcessor2_0,
 )
 from ..embeddings import TimestepEmbedding, Timesteps
 from ..modeling_utils import ModelMixin
@@ -490,6 +494,36 @@ class UNetMotionModel(ModelMixin, ConfigMixin, UNet2DConditionLoadersMixin):
        model.time_proj.load_state_dict(unet.time_proj.state_dict())
        model.time_embedding.load_state_dict(unet.time_embedding.state_dict())

+        if any(
+            isinstance(proc, (IPAdapterAttnProcessor, IPAdapterAttnProcessor2_0))
+            for proc in unet.attn_processors.values()
+        ):
+            attn_procs = {}
+            for name, processor in unet.attn_processors.items():
+                if name.endswith("attn1.processor"):
+                    attn_processor_class = (
+                        AttnProcessor2_0 if hasattr(F, "scaled_dot_product_attention") else AttnProcessor
+                    )
+                    attn_procs[name] = attn_processor_class()
+                else:
+                    attn_processor_class = (
+                        IPAdapterAttnProcessor2_0
+                        if hasattr(F, "scaled_dot_product_attention")
+                        else IPAdapterAttnProcessor
+                    )
+                    attn_procs[name] = attn_processor_class(
+                        hidden_size=processor.hidden_size,
+                        cross_attention_dim=processor.cross_attention_dim,
+                        scale=processor.scale,
+                        num_tokens=processor.num_tokens,
+                    )
+            for name, processor in model.attn_processors.items():
+                if name not in attn_procs:
+                    attn_procs[name] = processor.__class__()
+            model.set_attn_processor(attn_procs)
+            model.config.encoder_hid_dim_type = "ip_image_proj"
+            model.encoder_hid_proj = unet.encoder_hid_proj
+
        for i, down_block in enumerate(unet.down_blocks):
            model.down_blocks[i].resnets.load_state_dict(down_block.resnets.state_dict())
            if hasattr(model.down_blocks[i], "attentions"):
@@ -142,18 +142,20 @@ class VQModel(ModelMixin, ConfigMixin):
    ) -> Union[DecoderOutput, torch.Tensor]:
        # also go through quantization layer
        if not force_not_quantize:
-            quant, _, _ = self.quantize(h)
+            quant, commit_loss, _ = self.quantize(h)
        elif self.config.lookup_from_codebook:
            quant = self.quantize.get_codebook_entry(h, shape)
+            commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
        else:
            quant = h
+            commit_loss = torch.zeros((h.shape[0])).to(h.device, dtype=h.dtype)
        quant2 = self.post_quant_conv(quant)
        dec = self.decoder(quant2, quant if self.config.norm_type == "spatial" else None)

        if not return_dict:
-            return (dec,)
+            return dec, commit_loss

-        return DecoderOutput(sample=dec)
+        return DecoderOutput(sample=dec, commit_loss=commit_loss)

    def forward(
        self, sample: torch.Tensor, return_dict: bool = True
@@ -173,9 +175,8 @@ class VQModel(ModelMixin, ConfigMixin):
        """

        h = self.encode(sample).latents
-        dec = self.decode(h).sample
+        dec = self.decode(h)

        if not return_dict:
-            return (dec,)
-
-        return DecoderOutput(sample=dec)
+            return dec.sample, dec.commit_loss
+        return dec
@@ -13,7 +13,7 @@ class AnimateDiffPipelineOutput(BaseOutput):
    r"""
     Output class for AnimateDiff pipelines.

-     Args:
+    Args:
         frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
             List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
             denoised
@@ -100,20 +100,16 @@ class MultiControlNetModel(ModelMixin):
            variant (`str`, *optional*):
                If specified, weights are saved in the format pytorch_model.<variant>.bin.
        """
-        idx = 0
-        model_path_to_save = save_directory
-        for controlnet in self.nets:
+        for idx, controlnet in enumerate(self.nets):
+            suffix = "" if idx == 0 else f"_{idx}"
            controlnet.save_pretrained(
-                model_path_to_save,
+                save_directory + suffix,
                is_main_process=is_main_process,
                save_function=save_function,
                safe_serialization=safe_serialization,
                variant=variant,
            )

-            idx += 1
-            model_path_to_save = model_path_to_save + f"_{idx}"
-
    @classmethod
    def from_pretrained(cls, pretrained_model_path: Optional[Union[str, os.PathLike]], **kwargs):
        r"""
@@ -22,6 +22,7 @@ import torch
 import torch.nn.functional as F
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
@@ -926,7 +927,9 @@ class StableDiffusionControlNetPipeline(
        control_guidance_start: Union[float, List[float]] = 0.0,
        control_guidance_end: Union[float, List[float]] = 1.0,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
@@ -1019,11 +1022,11 @@ class StableDiffusionControlNetPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -1055,6 +1058,9 @@ class StableDiffusionControlNetPipeline(
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
            )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

        # align format for control guidance
@@ -21,6 +21,7 @@ import torch
 import torch.nn.functional as F
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
@@ -917,7 +918,9 @@ class StableDiffusionControlNetImg2ImgPipeline(
        control_guidance_start: Union[float, List[float]] = 0.0,
        control_guidance_end: Union[float, List[float]] = 1.0,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
@@ -1004,11 +1007,11 @@ class StableDiffusionControlNetImg2ImgPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -1040,6 +1043,9 @@ class StableDiffusionControlNetImg2ImgPipeline(
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
            )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

        # align format for control guidance
@@ -23,6 +23,7 @@ import torch
 import torch.nn.functional as F
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ControlNetModel, ImageProjection, UNet2DConditionModel
@@ -1134,7 +1135,9 @@ class StableDiffusionControlNetInpaintPipeline(
        control_guidance_start: Union[float, List[float]] = 0.0,
        control_guidance_end: Union[float, List[float]] = 1.0,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
@@ -1239,11 +1242,11 @@ class StableDiffusionControlNetInpaintPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -1275,6 +1278,9 @@ class StableDiffusionControlNetInpaintPipeline(
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
            )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

        # align format for control guidance
@@ -27,6 +27,7 @@ from transformers import (
    CLIPVisionModelWithProjection,
 )

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import (
    FromSingleFileMixin,
@@ -197,8 +198,26 @@ class StableDiffusionXLControlNetInpaintPipeline(
    """

    model_cpu_offload_seq = "text_encoder->text_encoder_2->unet->vae"
-    _optional_components = ["tokenizer", "tokenizer_2", "text_encoder", "text_encoder_2"]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+
+    _optional_components = [
+        "tokenizer",
+        "tokenizer_2",
+        "text_encoder",
+        "text_encoder_2",
+        "image_encoder",
+        "feature_extractor",
+    ]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "add_neg_time_ids",
+        "mask",
+        "masked_image_latents",
+    ]

    def __init__(
        self,
@@ -208,7 +227,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
        tokenizer: CLIPTokenizer,
        tokenizer_2: CLIPTokenizer,
        unet: UNet2DConditionModel,
-        controlnet: ControlNetModel,
+        controlnet: Union[ControlNetModel, List[ControlNetModel], Tuple[ControlNetModel], MultiControlNetModel],
        scheduler: KarrasDiffusionSchedulers,
        requires_aesthetics_score: bool = False,
        force_zeros_for_empty_prompt: bool = True,
@@ -1178,7 +1197,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
        aesthetic_score: float = 6.0,
        negative_aesthetic_score: float = 2.5,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
@@ -1317,11 +1338,11 @@ class StableDiffusionXLControlNetInpaintPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -1351,6 +1372,9 @@ class StableDiffusionXLControlNetInpaintPipeline(
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
            )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

        # align format for control guidance
@@ -1730,7 +1754,7 @@ class StableDiffusionXLControlNetInpaintPipeline(
                    down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
                    mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample])

-                if ip_adapter_image is not None:
+                if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
                    added_cond_kwargs["image_embeds"] = image_embeds

                if num_channels_unet == 9:
@@ -30,6 +30,7 @@ from transformers import (

 from diffusers.utils.import_utils import is_invisible_watermark_available

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import (
    FromSingleFileMixin,
@@ -235,7 +236,15 @@ class StableDiffusionXLControlNetPipeline(
        "feature_extractor",
        "image_encoder",
    ]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]

    def __init__(
        self,
@@ -1031,7 +1040,9 @@ class StableDiffusionXLControlNetPipeline(
        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
        negative_target_size: Optional[Tuple[int, int]] = None,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
@@ -1169,11 +1180,11 @@ class StableDiffusionXLControlNetPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -1203,6 +1214,9 @@ class StableDiffusionXLControlNetPipeline(
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
            )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

        # align format for control guidance
@@ -1522,6 +1536,12 @@ class StableDiffusionXLControlNetPipeline(
                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    negative_add_time_ids = callback_outputs.pop("negative_add_time_ids", negative_add_time_ids)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -30,6 +30,7 @@ from transformers import (

 from diffusers.utils.import_utils import is_invisible_watermark_available

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import (
    FromSingleFileMixin,
@@ -227,7 +228,15 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
        "feature_extractor",
        "image_encoder",
    ]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "add_neg_time_ids",
+    ]

    def __init__(
        self,
@@ -1105,7 +1114,9 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
        aesthetic_score: float = 6.0,
        negative_aesthetic_score: float = 2.5,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
@@ -1254,11 +1265,11 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -1288,6 +1299,9 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
            )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        controlnet = self.controlnet._orig_mod if is_compiled_module(self.controlnet) else self.controlnet

        # align format for control guidance
@@ -1578,6 +1592,12 @@ class StableDiffusionXLControlNetImg2ImgPipeline(
                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
+                    add_text_embeds = callback_outputs.pop("add_text_embeds", add_text_embeds)
+                    negative_pooled_prompt_embeds = callback_outputs.pop(
+                        "negative_pooled_prompt_embeds", negative_pooled_prompt_embeds
+                    )
+                    add_time_ids = callback_outputs.pop("add_time_ids", add_time_ids)
+                    add_neg_time_ids = callback_outputs.pop("add_neg_time_ids", add_neg_time_ids)

                # call the callback, if provided
                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
@@ -21,6 +21,7 @@ import torch
 import torch.nn.functional as F
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FromSingleFileMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel
@@ -648,7 +649,9 @@ class StableDiffusionControlNetXSPipeline(
        control_guidance_start: float = 0.0,
        control_guidance_end: float = 1.0,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    ):
        r"""
@@ -715,11 +718,11 @@ class StableDiffusionControlNetXSPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -734,6 +737,9 @@ class StableDiffusionControlNetXSPipeline(
                "not-safe-for-work" (nsfw) content.
        """

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        unet = self.unet._orig_mod if is_compiled_module(self.unet) else self.unet

        # 1. Check inputs. Raise error if not correct
@@ -28,6 +28,7 @@ from transformers import (

 from diffusers.utils.import_utils import is_invisible_watermark_available

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FromSingleFileMixin, StableDiffusionXLLoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ControlNetXSAdapter, UNet2DConditionModel, UNetControlNetXSModel
@@ -157,7 +158,15 @@ class StableDiffusionXLControlNetXSPipeline(
        "text_encoder_2",
        "feature_extractor",
    ]
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
+    _callback_tensor_inputs = [
+        "latents",
+        "prompt_embeds",
+        "negative_prompt_embeds",
+        "add_text_embeds",
+        "add_time_ids",
+        "negative_pooled_prompt_embeds",
+        "negative_add_time_ids",
+    ]

    def __init__(
        self,
@@ -739,7 +748,9 @@ class StableDiffusionXLControlNetXSPipeline(
        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
        negative_target_size: Optional[Tuple[int, int]] = None,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
    ):
        r"""
@@ -851,11 +862,11 @@ class StableDiffusionXLControlNetXSPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -869,6 +880,9 @@ class StableDiffusionXLControlNetXSPipeline(
                returned, otherwise a `tuple` is returned containing the output images.
        """

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        unet = self.unet._orig_mod if is_compiled_module(self.unet) else self.unet

        # 1. Check inputs. Raise error if not correct
@@ -817,7 +817,7 @@ class UNetFlatConditionModel(ModelMixin, ConfigMixin):
            positive_len = 768
            if isinstance(cross_attention_dim, int):
                positive_len = cross_attention_dim
-            elif isinstance(cross_attention_dim, tuple) or isinstance(cross_attention_dim, list):
+            elif isinstance(cross_attention_dim, (list, tuple)):
                positive_len = cross_attention_dim[0]

            feature_type = "text-only" if attention_type == "gated" else "text-image"
@@ -76,7 +76,7 @@ class I2VGenXLPipelineOutput(BaseOutput):
    r"""
     Output class for image-to-video pipeline.

-     Args:
+    Args:
         frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
             List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
             denoised
@@ -1216,7 +1216,7 @@ class LEditsPPPipelineStableDiffusion(
        Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
        inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead.

-         Args:
+        Args:
            image (`PipelineImageInput`):
                Input for the image(s) that are to be edited. Multiple input images have to default to the same aspect
                ratio.
@@ -1449,7 +1449,7 @@ class LEditsPPPipelineStableDiffusionXL(
        Paper](https://arxiv.org/abs/2301.12247). If the scheduler is set to [`~schedulers.DDIMScheduler`] the
        inversion proposed by [edit-friendly DPDM](https://arxiv.org/abs/2304.06140) will be performed instead.

-         Args:
+        Args:
            image (`PipelineImageInput`):
                Input for the image(s) that are to be edited. Multiple input images have to default to the same aspect
                ratio.
@@ -366,7 +366,7 @@ class PixArtAlphaPipeline(DiffusionPipeline):
            ):
                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    "The following part of your input was truncated because T5 can only handle sequences up to"
                    f" {max_length} tokens: {removed_text}"
                )

@@ -23,7 +23,7 @@ from transformers import T5EncoderModel, T5Tokenizer

 from ...image_processor import PixArtImageProcessor
 from ...models import AutoencoderKL, Transformer2DModel
-from ...schedulers import DPMSolverMultistepScheduler
+from ...schedulers import KarrasDiffusionSchedulers
 from ...utils import (
    BACKENDS_MAPPING,
    deprecate,
@@ -203,7 +203,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):
        text_encoder: T5EncoderModel,
        vae: AutoencoderKL,
        transformer: Transformer2DModel,
-        scheduler: DPMSolverMultistepScheduler,
+        scheduler: KarrasDiffusionSchedulers,
    ):
        super().__init__()

@@ -214,7 +214,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):
        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
        self.image_processor = PixArtImageProcessor(vae_scale_factor=self.vae_scale_factor)

-    # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.encode_prompt
+    # Copied from diffusers.pipelines.pixart_alpha.pipeline_pixart_alpha.PixArtAlphaPipeline.encode_prompt with 120->300
    def encode_prompt(
        self,
        prompt: Union[str, List[str]],
@@ -227,7 +227,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):
        prompt_attention_mask: Optional[torch.Tensor] = None,
        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
        clean_caption: bool = False,
-        max_sequence_length: int = 120,
+        max_sequence_length: int = 300,
        **kwargs,
    ):
        r"""
@@ -254,7 +254,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):
                string.
            clean_caption (`bool`, defaults to `False`):
                If `True`, the function will preprocess and clean the provided caption before encoding.
-            max_sequence_length (`int`, defaults to 120): Maximum sequence length to use for the prompt.
+            max_sequence_length (`int`, defaults to 300): Maximum sequence length to use for the prompt.
        """

        if "mask_feature" in kwargs:
@@ -292,7 +292,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):
            ):
                removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_length - 1 : -1])
                logger.warning(
-                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    "The following part of your input was truncated because T5 can only handle sequences up to"
                    f" {max_length} tokens: {removed_text}"
                )

@@ -707,7 +707,7 @@ class PixArtSigmaPipeline(DiffusionPipeline):
                If set to `True`, the requested height and width are first mapped to the closest resolutions using
                `ASPECT_RATIO_1024_BIN`. After the produced latents are decoded into images, they are resized back to
                the requested resolution. Useful for generating non-square images.
-            max_sequence_length (`int` defaults to 120): Maximum sequence length to use with the `prompt`.
+            max_sequence_length (`int`, defaults to 300): Maximum sequence length to use with the `prompt`.

        Examples:

@@ -844,7 +844,7 @@ class ShapERenderer(ModelMixin, ConfigMixin):
        transmittance(t[i + 1]) := transmittance(t[i]). 4) The last term is integration to infinity (e.g. [t[-1],
        math.inf]) that is evaluated by the void_model (i.e. we consider this space to be empty).

-        args:
+        Args:
            rays: [batch_size x ... x 2 x 3] origin and direction. sampler: disjoint volume integrals. n_samples:
            number of ts to sample. prev_model_outputs: model outputs from the previous rendering step, including

@@ -197,7 +197,7 @@ class OnnxStableDiffusionUpscalePipeline(DiffusionPipeline):
            )

        # verify batch size of prompt and image are same if image is a list or tensor or numpy array
-        if isinstance(image, list) or isinstance(image, np.ndarray):
+        if isinstance(image, (list, np.ndarray)):
            if prompt is not None and isinstance(prompt, str):
                batch_size = 1
            elif prompt is not None and isinstance(prompt, list):
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import inspect
 from typing import Any, Callable, Dict, List, Optional, Union

@@ -19,6 +18,7 @@ import torch
 from packaging import version
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...configuration_utils import FrozenDict
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
@@ -775,7 +775,9 @@ class StableDiffusionPipeline(
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        guidance_rescale: float = 0.0,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
@@ -845,11 +847,11 @@ class StableDiffusionPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -881,6 +883,9 @@ class StableDiffusionPipeline(
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`",
            )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor
@@ -21,6 +21,7 @@ import torch
 from packaging import version
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...configuration_utils import FrozenDict
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
@@ -862,7 +863,9 @@ class StableDiffusionImg2ImgPipeline(
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        clip_skip: int = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
@@ -932,11 +935,11 @@ class StableDiffusionImg2ImgPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -967,6 +970,9 @@ class StableDiffusionImg2ImgPipeline(
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
            )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
@@ -21,6 +21,7 @@ import torch
 from packaging import version
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...configuration_utils import FrozenDict
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
@@ -1014,7 +1015,9 @@ class StableDiffusionInpaintPipeline(
        return_dict: bool = True,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        clip_skip: int = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
@@ -1107,11 +1110,11 @@ class StableDiffusionInpaintPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -1171,6 +1174,9 @@ class StableDiffusionInpaintPipeline(
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
            )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor
@@ -13,13 +13,14 @@
 # limitations under the License.

 import inspect
-from typing import Callable, Dict, List, Optional, Union
+from typing import Any, Callable, Dict, List, Optional, Union

 import numpy as np
 import PIL.Image
 import torch
 from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin
 from ...models import AutoencoderKL, ImageProjection, UNet2DConditionModel
@@ -175,8 +176,11 @@ class StableDiffusionInstructPix2PixPipeline(
        ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        r"""
@@ -227,15 +231,18 @@ class StableDiffusionInstructPix2PixPipeline(
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
                plain tuple.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
                `._callback_tensor_inputs` attribute of your pipeline class.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).

        Examples:

@@ -290,6 +297,9 @@ class StableDiffusionInstructPix2PixPipeline(
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
            )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        # 0. Check inputs
        self.check_inputs(
            prompt,
@@ -409,6 +419,7 @@ class StableDiffusionInstructPix2PixPipeline(
                    t,
                    encoder_hidden_states=prompt_embeds,
                    added_cond_kwargs=added_cond_kwargs,
+                    cross_attention_kwargs=cross_attention_kwargs,
                    return_dict=False,
                )[0]

@@ -221,7 +221,7 @@ class StableDiffusionLatentUpscalePipeline(DiffusionPipeline, StableDiffusionMix
            )

        # verify batch size of prompt and image are same if image is a list or tensor
-        if isinstance(image, list) or isinstance(image, torch.Tensor):
+        if isinstance(image, (list, torch.Tensor)):
            if isinstance(prompt, str):
                batch_size = 1
            else:
@@ -468,7 +468,7 @@ class StableDiffusionUpscalePipeline(
            )

        # verify batch size of prompt and image are same if image is a list or tensor or numpy array
-        if isinstance(image, list) or isinstance(image, torch.Tensor) or isinstance(image, np.ndarray):
+        if isinstance(image, (list, np.ndarray, torch.Tensor)):
            if prompt is not None and isinstance(prompt, str):
                batch_size = 1
            elif prompt is not None and isinstance(prompt, list):
@@ -185,7 +185,7 @@ def preprocess(image):
 def preprocess_mask(mask, batch_size: int = 1):
    if not isinstance(mask, torch.Tensor):
        # preprocess mask
-        if isinstance(mask, PIL.Image.Image) or isinstance(mask, np.ndarray):
+        if isinstance(mask, (PIL.Image.Image, np.ndarray)):
            mask = [mask]

        if isinstance(mask, list):
@@ -24,6 +24,7 @@ from transformers import (
    CLIPVisionModelWithProjection,
 )

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import (
    FromSingleFileMixin,
@@ -861,7 +862,9 @@ class StableDiffusionXLPipeline(
        negative_crops_coords_top_left: Tuple[int, int] = (0, 0),
        negative_target_size: Optional[Tuple[int, int]] = None,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
@@ -992,11 +995,11 @@ class StableDiffusionXLPipeline(
                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -1026,6 +1029,9 @@ class StableDiffusionXLPipeline(
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
            )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        # 0. Default height and width to unet
        height = height or self.default_sample_size * self.vae_scale_factor
        width = width or self.default_sample_size * self.vae_scale_factor
@@ -25,6 +25,7 @@ from transformers import (
    CLIPVisionModelWithProjection,
 )

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import (
    FromSingleFileMixin,
@@ -1008,7 +1009,9 @@ class StableDiffusionXLImg2ImgPipeline(
        aesthetic_score: float = 6.0,
        negative_aesthetic_score: float = 2.5,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
@@ -1157,11 +1160,11 @@ class StableDiffusionXLImg2ImgPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -1191,6 +1194,9 @@ class StableDiffusionXLImg2ImgPipeline(
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
            )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        # 1. Check inputs. Raise error if not correct
        self.check_inputs(
            prompt,
@@ -26,6 +26,7 @@ from transformers import (
    CLIPVisionModelWithProjection,
 )

+from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...image_processor import PipelineImageInput, VaeImageProcessor
 from ...loaders import (
    FromSingleFileMixin,
@@ -1243,7 +1244,9 @@ class StableDiffusionXLInpaintPipeline(
        aesthetic_score: float = 6.0,
        negative_aesthetic_score: float = 2.5,
        clip_skip: Optional[int] = None,
-        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
+        callback_on_step_end: Optional[
+            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
+        ] = None,
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        **kwargs,
    ):
@@ -1411,11 +1414,11 @@ class StableDiffusionXLInpaintPipeline(
            clip_skip (`int`, *optional*):
                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
+                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
+                each denoising step during the inference, with the following arguments: `callback_on_step_end(self:
+                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
+                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
            callback_on_step_end_tensor_inputs (`List`, *optional*):
                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
@@ -1445,6 +1448,9 @@ class StableDiffusionXLInpaintPipeline(
                "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`",
            )

+        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
+            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
+
        # 0. Default height and width to unet
        height = height or self.unet.config.sample_size * self.vae_scale_factor
        width = width or self.unet.config.sample_size * self.vae_scale_factor
@@ -15,7 +15,7 @@ class TextToVideoSDPipelineOutput(BaseOutput):
    """
     Output class for text-to-video pipelines.

-     Args:
+    Args:
         frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
             List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
             denoised
@@ -347,11 +347,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
                otherwise a tuple is returned where the first element is the sample tensor.
        """

-        if (
-            isinstance(timestep, int)
-            or isinstance(timestep, torch.IntTensor)
-            or isinstance(timestep, torch.LongTensor)
-        ):
+        if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
            raise ValueError(
                (
                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
@@ -310,11 +310,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
                returned, otherwise a tuple is returned where the first element is the sample tensor.
        """

-        if (
-            isinstance(timestep, int)
-            or isinstance(timestep, torch.IntTensor)
-            or isinstance(timestep, torch.LongTensor)
-        ):
+        if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
            raise ValueError(
                (
                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
@@ -375,11 +375,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):

        """

-        if (
-            isinstance(timestep, int)
-            or isinstance(timestep, torch.IntTensor)
-            or isinstance(timestep, torch.LongTensor)
-        ):
+        if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
            raise ValueError(
                (
                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
@@ -530,11 +530,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
                returned, otherwise a tuple is returned where the first element is the sample tensor.
        """

-        if (
-            isinstance(timestep, int)
-            or isinstance(timestep, torch.IntTensor)
-            or isinstance(timestep, torch.LongTensor)
-        ):
+        if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
            raise ValueError(
                (
                    "Passing integer indices (e.g. from `enumerate(timesteps)`) as timesteps to"
@@ -58,20 +58,25 @@ from .import_utils import (
    get_objects_from_module,
    is_accelerate_available,
    is_accelerate_version,
+    is_bitsandbytes_available,
    is_bs4_available,
    is_flax_available,
    is_ftfy_available,
+    is_google_colab,
    is_inflect_available,
    is_invisible_watermark_available,
    is_k_diffusion_available,
    is_k_diffusion_version,
    is_librosa_available,
    is_note_seq_available,
+    is_notebook,
    is_onnx_available,
    is_peft_available,
    is_peft_version,
+    is_safetensors_available,
    is_scipy_available,
    is_tensorboard_available,
+    is_timm_available,
    is_torch_available,
    is_torch_npu_available,
    is_torch_version,
@@ -295,6 +295,39 @@ try:
 except importlib_metadata.PackageNotFoundError:
    _torchvision_available = False

+# timm counts as available only if its module spec can be found AND its distribution metadata is installed.
+_timm_available = importlib.util.find_spec("timm") is not None
+if _timm_available:
+    try:
+        _timm_version = importlib_metadata.version("timm")
+        logger.info(f"Timm version {_timm_version} available.")
+    except importlib_metadata.PackageNotFoundError:
+        # A spec was found but no distribution metadata exists: report timm as unavailable.
+        _timm_available = False
+
+
+def is_timm_available():
+    """Return the module-level `_timm_available` flag, which is computed once when this module is imported."""
+    return _timm_available
+
+
+# bitsandbytes counts as available only if its module spec can be found AND its distribution metadata is
+# installed. The metadata lookup is skipped when no spec is found, so the debug message below is only
+# emitted for an installation that can actually be imported (same shape as the timm check).
+_bitsandbytes_available = importlib.util.find_spec("bitsandbytes") is not None
+if _bitsandbytes_available:
+    try:
+        _bitsandbytes_version = importlib_metadata.version("bitsandbytes")
+        logger.debug(f"Successfully imported bitsandbytes version {_bitsandbytes_version}")
+    except importlib_metadata.PackageNotFoundError:
+        # A spec was found but no distribution metadata exists: report bitsandbytes as unavailable.
+        _bitsandbytes_available = False
+
+# Taken from `huggingface_hub`.
+# `_is_notebook` stays False unless the active IPython shell class derives from `ZMQInteractiveShell`.
+_is_notebook = False
+try:
+    # `get_ipython` is deliberately not imported here (hence the F821 suppression); when the name is
+    # undefined the NameError handler below leaves `_is_notebook` as False.
+    shell_class = get_ipython().__class__  # type: ignore # noqa: F821
+    for parent_class in shell_class.__mro__:  # e.g. "is subclass of"
+        if parent_class.__name__ == "ZMQInteractiveShell":
+            _is_notebook = True  # Jupyter notebook, Google colab or qtconsole
+            break
+except NameError:
+    pass  # Probably standard Python interpreter
+
+# True only if `google.colab` is already present in `sys.modules` when this module is imported.
+_is_google_colab = "google.colab" in sys.modules
+

 def is_torch_available():
    return _torch_available
@@ -392,6 +425,22 @@ def is_torchvision_available():
    return _torchvision_available


+def is_safetensors_available():
+    """Return the module-level `_safetensors_available` flag (it is assigned outside this excerpt)."""
+    return _safetensors_available
+
+
+def is_bitsandbytes_available():
+    """Return the module-level `_bitsandbytes_available` flag, which is computed once at import time."""
+    return _bitsandbytes_available
+
+
+def is_notebook():
+    """Return the module-level `_is_notebook` flag (True when a `ZMQInteractiveShell`-based shell was detected)."""
+    return _is_notebook
+
+
+def is_google_colab():
+    """Return the module-level `_is_google_colab` flag (True when `google.colab` was in `sys.modules` at import)."""
+    return _is_google_colab
+
+
 # docstyle-ignore
 FLAX_IMPORT_ERROR = """
 {0} requires the FLAX library but it was not found in your environment. Checkout the instructions on the
@@ -499,6 +548,20 @@ INVISIBLE_WATERMARK_IMPORT_ERROR = """
 {0} requires the invisible-watermark library but it was not found in your environment. You can install it with pip: `pip install invisible-watermark>=0.2.0`
 """

+# docstyle-ignore
+PEFT_IMPORT_ERROR = """
+{0} requires the peft library but it was not found in your environment. You can install it with pip: `pip install peft`
+"""
+
+# docstyle-ignore
+SAFETENSORS_IMPORT_ERROR = """
+{0} requires the safetensors library but it was not found in your environment. You can install it with pip: `pip install safetensors`
+"""
+
+# docstyle-ignore
+BITSANDBYTES_IMPORT_ERROR = """
+{0} requires the bitsandbytes library but it was not found in your environment. You can install it with pip: `pip install bitsandbytes`
+"""

 BACKENDS_MAPPING = OrderedDict(
    [
@@ -520,6 +583,9 @@ BACKENDS_MAPPING = OrderedDict(
        ("ftfy", (is_ftfy_available, FTFY_IMPORT_ERROR)),
        ("torchsde", (is_torchsde_available, TORCHSDE_IMPORT_ERROR)),
        ("invisible_watermark", (is_invisible_watermark_available, INVISIBLE_WATERMARK_IMPORT_ERROR)),
+        ("peft", (is_peft_available, PEFT_IMPORT_ERROR)),
+        ("safetensors", (is_safetensors_available, SAFETENSORS_IMPORT_ERROR)),
+        ("bitsandbytes", (is_bitsandbytes_available, BITSANDBYTES_IMPORT_ERROR)),
    ]
 )

@@ -33,6 +33,7 @@ from .import_utils import (
    is_onnx_available,
    is_opencv_available,
    is_peft_available,
+    is_timm_available,
    is_torch_available,
    is_torch_version,
    is_torchsde_available,
@@ -340,6 +341,13 @@ def require_peft_backend(test_case):
    return unittest.skipUnless(USE_PEFT_BACKEND, "test requires PEFT backend")(test_case)


+def require_timm(test_case):
+    """
+    Skip decorator for timm-dependent tests: the wrapped test only runs when `is_timm_available()` is true.
+    """
+    skip_without_timm = unittest.skipUnless(is_timm_available(), "test requires timm")
+    return skip_without_timm(test_case)
+
+
 def require_peft_version_greater(peft_version):
    """
    Decorator marking a test that requires PEFT backend with a specific version, this would require some specific
@@ -30,17 +30,19 @@ class VideoProcessor(VaeImageProcessor):
        Preprocesses input video(s).

        Args:
-            video: The input video. It can be one of the following:
+            video (`List[PIL.Image]`, `List[List[PIL.Image]]`, `torch.Tensor`, `np.array`, `List[torch.Tensor]`, `List[np.array]`):
+                The input video. It can be one of the following:
                * List of the PIL images.
                * List of list of PIL images.
-                * 4D Torch tensors (expected shape for each tensor: (num_frames, num_channels, height, width)).
-                * 4D NumPy arrays (expected shape for each array: (num_frames, height, width, num_channels)).
-                * List of 4D Torch tensors (expected shape for each tensor: (num_frames, num_channels, height, width)).
-                * List of 4D NumPy arrays (expected shape for each array: (num_frames, height, width, num_channels)).
-                * 5D NumPy arrays: expected shape for each array: (batch_size, num_frames, height, width,
-                  num_channels).
-                * 5D Torch tensors: expected shape for each array: (batch_size, num_frames, num_channels, height,
-                  width).
+                * 4D Torch tensors (expected shape for each tensor `(num_frames, num_channels, height, width)`).
+                * 4D NumPy arrays (expected shape for each array `(num_frames, height, width, num_channels)`).
+                * List of 4D Torch tensors (expected shape for each tensor `(num_frames, num_channels, height,
+                  width)`).
+                * List of 4D NumPy arrays (expected shape for each array `(num_frames, height, width, num_channels)`).
+                * 5D NumPy arrays: expected shape for each array `(batch_size, num_frames, height, width,
+                  num_channels)`.
+                * 5D Torch tensors: expected shape for each tensor `(batch_size, num_frames, num_channels, height,
+                  width)`.
            height (`int`, *optional*, defaults to `None`):
                The height in preprocessed frames of the video. If `None`, will use the `get_default_height_width()` to
                get default height.
@@ -224,7 +224,7 @@ class LoraSDXLIntegrationTests(unittest.TestCase):
        ).images

        images = images[0, -3:, -3:, -1].flatten()
-        expected = np.array([0.4468, 0.4087, 0.4134, 0.366, 0.3202, 0.3505, 0.3786, 0.387, 0.3535])
+        # Reference values for the flattened `images[0, -3:, -3:, -1]` slice compared below.
+        expected = np.array([0.4468, 0.4061, 0.4134, 0.3637, 0.3202, 0.365, 0.3786, 0.3725, 0.3535])

        max_diff = numpy_cosine_similarity_distance(expected, images)
        assert max_diff < 1e-4
@@ -507,13 +507,12 @@ class LoraSDXLIntegrationTests(unittest.TestCase):
        image = load_image(
            "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/bird_canny.png"
        )
-
        images = pipe(prompt, image=image, generator=generator, output_type="np", num_inference_steps=3).images

        assert images[0].shape == (768, 512, 3)

        original_image = images[0, -3:, -3:, -1].flatten()
-        expected_image = np.array([0.4574, 0.4461, 0.4435, 0.4462, 0.4396, 0.439, 0.4474, 0.4486, 0.4333])
+        expected_image = np.array([0.4574, 0.4487, 0.4435, 0.5163, 0.4396, 0.4411, 0.518, 0.4465, 0.4333])

        max_diff = numpy_cosine_similarity_distance(expected_image, original_image)
        assert max_diff < 1e-4
@@ -98,3 +98,19 @@ class VQModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
        expected_output_slice = torch.tensor([-0.0153, -0.4044, -0.1880, -0.5161, -0.2418, -0.4072, -0.1612, -0.0633, -0.0143])
        # fmt: on
        self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3))
+
+    def test_loss_pretrained(self):
+        """Compare `commit_loss` of the pretrained `fusing/vqgan-dummy` checkpoint against a stored reference."""
+        model = VQModel.from_pretrained("fusing/vqgan-dummy")
+        model.to(torch_device).eval()
+
+        # Seed before sampling so the random input image below is reproducible.
+        torch.manual_seed(0)
+        backend_manual_seed(torch_device, 0)
+
+        # The input is sampled without an explicit device and only then moved to `torch_device`.
+        image = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
+        image = image.to(torch_device)
+        with torch.no_grad():
+            output = model(image).commit_loss.cpu()
+        # fmt: off
+        expected_output = torch.tensor([0.1936])
+        # fmt: on
+        # The loss must match the reference within an absolute tolerance of 1e-3.
+        self.assertTrue(torch.allclose(output, expected_output, atol=1e-3))
@@ -1,191 +0,0 @@
-# coding=utf-8
-# Copyright 2024 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import gc
-import unittest
-
-import torch
-
-from diffusers import StableCascadeUNet
-from diffusers.utils import logging
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    numpy_cosine_similarity_distance,
-    require_torch_gpu,
-    slow,
-)
-from diffusers.utils.torch_utils import randn_tensor
-
-
-logger = logging.get_logger(__name__)
-
-enable_full_determinism()
-
-
-@slow
-class StableCascadeUNetModelSlowTests(unittest.TestCase):
-    def tearDown(self) -> None:
-        super().tearDown()
-        gc.collect()
-        torch.cuda.empty_cache()
-
-    def test_stable_cascade_unet_prior_single_file_components(self):
-        single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_c_bf16.safetensors"
-        single_file_unet = StableCascadeUNet.from_single_file(single_file_url)
-
-        single_file_unet_config = single_file_unet.config
-        del single_file_unet
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        unet = StableCascadeUNet.from_pretrained("stabilityai/stable-cascade-prior", subfolder="prior", variant="bf16")
-        unet_config = unet.config
-        del unet
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
-        for param_name, param_value in single_file_unet_config.items():
-            if param_name in PARAMS_TO_IGNORE:
-                continue
-
-            assert unet_config[param_name] == param_value
-
-    def test_stable_cascade_unet_decoder_single_file_components(self):
-        single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b_bf16.safetensors"
-        single_file_unet = StableCascadeUNet.from_single_file(single_file_url)
-
-        single_file_unet_config = single_file_unet.config
-        del single_file_unet
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        unet = StableCascadeUNet.from_pretrained("stabilityai/stable-cascade", subfolder="decoder", variant="bf16")
-        unet_config = unet.config
-        del unet
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
-        for param_name, param_value in single_file_unet_config.items():
-            if param_name in PARAMS_TO_IGNORE:
-                continue
-
-            assert unet_config[param_name] == param_value
-
-    def test_stable_cascade_unet_config_loading(self):
-        config = StableCascadeUNet.load_config(
-            pretrained_model_name_or_path="diffusers/stable-cascade-configs", subfolder="prior"
-        )
-        single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_c_bf16.safetensors"
-
-        single_file_unet = StableCascadeUNet.from_single_file(single_file_url, config=config)
-        single_file_unet_config = single_file_unet.config
-        del single_file_unet
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        PARAMS_TO_IGNORE = ["torch_dtype", "_name_or_path", "_use_default_values", "_diffusers_version"]
-        for param_name, param_value in config.items():
-            if param_name in PARAMS_TO_IGNORE:
-                continue
-
-            assert single_file_unet_config[param_name] == param_value
-
-    @require_torch_gpu
-    def test_stable_cascade_unet_single_file_prior_forward_pass(self):
-        dtype = torch.bfloat16
-        generator = torch.Generator("cpu")
-
-        model_inputs = {
-            "sample": randn_tensor((1, 16, 24, 24), generator=generator.manual_seed(0)).to("cuda", dtype),
-            "timestep_ratio": torch.tensor([1]).to("cuda", dtype),
-            "clip_text_pooled": randn_tensor((1, 1, 1280), generator=generator.manual_seed(0)).to("cuda", dtype),
-            "clip_text": randn_tensor((1, 77, 1280), generator=generator.manual_seed(0)).to("cuda", dtype),
-            "clip_img": randn_tensor((1, 1, 768), generator=generator.manual_seed(0)).to("cuda", dtype),
-            "pixels": randn_tensor((1, 3, 8, 8), generator=generator.manual_seed(0)).to("cuda", dtype),
-        }
-
-        unet = StableCascadeUNet.from_pretrained(
-            "stabilityai/stable-cascade-prior",
-            subfolder="prior",
-            revision="refs/pr/2",
-            variant="bf16",
-            torch_dtype=dtype,
-        )
-        unet.to("cuda")
-        with torch.no_grad():
-            prior_output = unet(**model_inputs).sample.float().cpu().numpy()
-
-        # Remove UNet from GPU memory before loading the single file UNet model
-        del unet
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_c_bf16.safetensors"
-        single_file_unet = StableCascadeUNet.from_single_file(single_file_url, torch_dtype=dtype)
-        single_file_unet.to("cuda")
-        with torch.no_grad():
-            prior_single_file_output = single_file_unet(**model_inputs).sample.float().cpu().numpy()
-
-        # Remove UNet from GPU memory before loading the single file UNet model
-        del single_file_unet
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        max_diff = numpy_cosine_similarity_distance(prior_output.flatten(), prior_single_file_output.flatten())
-        assert max_diff < 8e-3
-
-    @require_torch_gpu
-    def test_stable_cascade_unet_single_file_decoder_forward_pass(self):
-        dtype = torch.float32
-        generator = torch.Generator("cpu")
-
-        model_inputs = {
-            "sample": randn_tensor((1, 4, 256, 256), generator=generator.manual_seed(0)).to("cuda", dtype),
-            "timestep_ratio": torch.tensor([1]).to("cuda", dtype),
-            "clip_text": randn_tensor((1, 77, 1280), generator=generator.manual_seed(0)).to("cuda", dtype),
-            "clip_text_pooled": randn_tensor((1, 1, 1280), generator=generator.manual_seed(0)).to("cuda", dtype),
-            "pixels": randn_tensor((1, 3, 8, 8), generator=generator.manual_seed(0)).to("cuda", dtype),
-        }
-
-        unet = StableCascadeUNet.from_pretrained(
-            "stabilityai/stable-cascade",
-            subfolder="decoder",
-            revision="refs/pr/44",
-            torch_dtype=dtype,
-        )
-        unet.to("cuda")
-        with torch.no_grad():
-            prior_output = unet(**model_inputs).sample.float().cpu().numpy()
-
-        # Remove UNet from GPU memory before loading the single file UNet model
-        del unet
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        single_file_url = "https://huggingface.co/stabilityai/stable-cascade/blob/main/stage_b.safetensors"
-        single_file_unet = StableCascadeUNet.from_single_file(single_file_url, torch_dtype=dtype)
-        single_file_unet.to("cuda")
-        with torch.no_grad():
-            prior_single_file_output = single_file_unet(**model_inputs).sample.float().cpu().numpy()
-
-        # Remove UNet from GPU memory before loading the single file UNet model
-        del single_file_unet
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        max_diff = numpy_cosine_similarity_distance(prior_output.flatten(), prior_single_file_output.flatten())
-        assert max_diff < 1e-4
@@ -19,7 +19,15 @@ import unittest
 import numpy as np
 import torch
 from PIL import Image
-from transformers import CLIPTextConfig, CLIPTextModel, CLIPTextModelWithProjection, CLIPTokenizer
+from transformers import (
+    CLIPImageProcessor,
+    CLIPTextConfig,
+    CLIPTextModel,
+    CLIPTextModelWithProjection,
+    CLIPTokenizer,
+    CLIPVisionConfig,
+    CLIPVisionModelWithProjection,
+)

 from diffusers import (
    AutoencoderKL,
@@ -34,6 +42,7 @@ from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor
 from ..pipeline_params import (
    IMAGE_TO_IMAGE_IMAGE_PARAMS,
    TEXT_TO_IMAGE_BATCH_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
    TEXT_TO_IMAGE_IMAGE_PARAMS,
    TEXT_TO_IMAGE_PARAMS,
 )
@@ -55,6 +64,14 @@ class ControlNetPipelineSDXLFastTests(
    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
    image_params = frozenset(IMAGE_TO_IMAGE_IMAGE_PARAMS.union({"mask_image", "control_image"}))
    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union(
+        {
+            "add_text_embeds",
+            "add_time_ids",
+            "mask",
+            "masked_image_latents",
+        }
+    )

    def get_dummy_components(self):
        torch.manual_seed(0)
@@ -129,6 +146,30 @@ class ControlNetPipelineSDXLFastTests(
        text_encoder_2 = CLIPTextModelWithProjection(text_encoder_config)
        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")

+        image_encoder_config = CLIPVisionConfig(
+            hidden_size=32,
+            image_size=224,
+            projection_dim=32,
+            intermediate_size=37,
+            num_attention_heads=4,
+            num_channels=3,
+            num_hidden_layers=5,
+            patch_size=14,
+        )
+
+        image_encoder = CLIPVisionModelWithProjection(image_encoder_config)
+
+        feature_extractor = CLIPImageProcessor(
+            crop_size=224,
+            do_center_crop=True,
+            do_normalize=True,
+            do_resize=True,
+            image_mean=[0.48145466, 0.4578275, 0.40821073],
+            image_std=[0.26862954, 0.26130258, 0.27577711],
+            resample=3,
+            size=224,
+        )
+
        components = {
            "unet": unet,
            "controlnet": controlnet,
@@ -138,6 +179,8 @@ class ControlNetPipelineSDXLFastTests(
            "tokenizer": tokenizer,
            "text_encoder_2": text_encoder_2,
            "tokenizer_2": tokenizer_2,
+            "image_encoder": image_encoder,
+            "feature_extractor": feature_extractor,
        }
        return components

@@ -34,6 +34,7 @@ from ..pipeline_params import (
    IMAGE_TO_IMAGE_IMAGE_PARAMS,
    TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS,
    TEXT_GUIDED_IMAGE_VARIATION_PARAMS,
+    TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS,
 )
 from ..test_pipelines_common import (
    IPAdapterTesterMixin,
@@ -55,9 +56,13 @@ class ControlNetPipelineSDXLImg2ImgFastTests(
 ):
    pipeline_class = StableDiffusionXLControlNetImg2ImgPipeline
    params = TEXT_GUIDED_IMAGE_VARIATION_PARAMS
+    required_optional_params = PipelineTesterMixin.required_optional_params - {"latents"}
    batch_params = TEXT_GUIDED_IMAGE_VARIATION_BATCH_PARAMS
    image_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
    image_latents_params = IMAGE_TO_IMAGE_IMAGE_PARAMS
+    callback_cfg_params = TEXT_TO_IMAGE_CALLBACK_CFG_PARAMS.union(
+        {"add_text_embeds", "add_time_ids", "add_neg_time_ids"}
+    )

    def get_dummy_components(self, skip_first_text_encoder=False):
        torch.manual_seed(0)
@@ -336,7 +336,7 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
        image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images

        image_slice = image[0, -3:, -3:, -1]
-        expected_slice = np.array([0.0742, 0.0835, 0.2114, 0.0295, 0.0784, 0.2361, 0.1738, 0.2251, 0.3589])
+        expected_slice = np.array([0.4517, 0.4446, 0.4375, 0.449, 0.4399, 0.4365, 0.4583, 0.4629, 0.4473])

        max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice)
        self.assertLessEqual(max_diff, 1e-4)
@@ -344,7 +344,12 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
    def test_pixart_512(self):
        generator = torch.Generator("cpu").manual_seed(0)

-        pipe = PixArtSigmaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16)
+        transformer = Transformer2DModel.from_pretrained(
+            self.ckpt_id_512, subfolder="transformer", torch_dtype=torch.float16
+        )
+        pipe = PixArtSigmaPipeline.from_pretrained(
+            self.ckpt_id_1024, transformer=transformer, torch_dtype=torch.float16
+        )
        pipe.enable_model_cpu_offload()

        prompt = self.prompt
@@ -352,7 +357,7 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
        image = pipe(prompt, generator=generator, num_inference_steps=2, output_type="np").images

        image_slice = image[0, -3:, -3:, -1]
-        expected_slice = np.array([0.3477, 0.3882, 0.4541, 0.3413, 0.3821, 0.4463, 0.4001, 0.4409, 0.4958])
+        expected_slice = np.array([0.0479, 0.0378, 0.0217, 0.0942, 0.064, 0.0791, 0.2073, 0.1975, 0.2017])

        max_diff = numpy_cosine_similarity_distance(image_slice.flatten(), expected_slice)
        self.assertLessEqual(max_diff, 1e-4)
@@ -394,7 +399,12 @@ class PixArtSigmaPipelineIntegrationTests(unittest.TestCase):
    def test_pixart_512_without_resolution_binning(self):
        generator = torch.manual_seed(0)

-        pipe = PixArtSigmaPipeline.from_pretrained(self.ckpt_id_512, torch_dtype=torch.float16)
+        transformer = Transformer2DModel.from_pretrained(
+            self.ckpt_id_512, subfolder="transformer", torch_dtype=torch.float16
+        )
+        pipe = PixArtSigmaPipeline.from_pretrained(
+            self.ckpt_id_1024, transformer=transformer, torch_dtype=torch.float16
+        )
        pipe.enable_model_cpu_offload()

        prompt = self.prompt
@@ -82,6 +82,7 @@ class StableDiffusionPipelineSingleFileSlowTests(unittest.TestCase, SDSingleFile
        assert pipe.vae.config.scaling_factor == new_scaling_factor


+@slow
 class StableDiffusion21PipelineSingleFileSlowTests(unittest.TestCase, SDSingleFileTesterMixin):
    pipeline_class = StableDiffusionPipeline
    ckpt_path = "https://huggingface.co/stabilityai/stable-diffusion-2-1/blob/main/v2-1_768-ema-pruned.safetensors"
@@ -73,7 +73,7 @@ diffusers_module = spec.loader.load_module()

 # Thanks to https://stackoverflow.com/questions/29916065/how-to-do-camelcase-split-in-python
 def camel_case_split(identifier):
-    "Split a camelcased `identifier` into words."
+    """Split a camelcased `identifier` into words."""
    matches = re.finditer(".+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)", identifier)
    return [m.group(0) for m in matches]
Author	SHA1	Message	Date
Dhruv Nair	c83683fdd1	update	2024-05-24 07:38:13 +00:00
Dhruv Nair	6d6d19d7fc	update	2024-05-23 06:02:02 +00:00
Dhruv Nair	baab065679	Remove unnecessary single file tests for SD Cascade UNet (#7996 ) update	2024-05-22 12:29:59 +05:30
BootesVoid	509741aea7	fix: Attribute error in Logger object (logger.warning) (#8183 )	2024-05-22 12:29:11 +05:30
Lucain	e1df77ee1e	Use HF_TOKEN env var in CI (#7993 )	2024-05-21 14:58:10 +05:30
Steven Liu	fdb1baa05c	[docs] VideoProcessor (#7965 ) * fix? * fix? * fix	2024-05-21 08:18:21 +05:30
Vinh H. Pham	6529ee67ec	Make VAE compatible to torch.compile() (#7984 ) make VAE compatible to torch.compile() Co-authored-by: YiYi Xu <yixu310@gmail.com>	2024-05-20 13:43:59 -04:00
Sai-Suraj-27	df2bc5ef28	fix: Fixed few `docstrings` according to the Google Style Guide (#7717 ) Fixed few docstrings according to the Google Style Guide.	2024-05-20 10:26:05 -07:00
Aleksei Zhuravlev	a7bf77fc28	Passing `cross_attention_kwargs` to `StableDiffusionInstructPix2PixPipeline` (#7961 ) * Update pipeline_stable_diffusion_instruct_pix2pix.py Add `cross_attention_kwargs` to `__call__` method of `StableDiffusionInstructPix2PixPipeline`, which are passed to UNet. * Update documentation for pipeline_stable_diffusion_instruct_pix2pix.py * Update docstring * Update docstring * Fix typing import	2024-05-20 13:14:34 -04:00
Junsong Chen	0f0defdb65	[docs] add doc for PixArtSigmaPipeline (#7857 ) * 1. add doc for PixArtSigmaPipeline; --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com> Co-authored-by: Guillaume LEGENDRE <glegendre01@gmail.com> Co-authored-by: Álvaro Somoza <asomoza@users.noreply.github.com> Co-authored-by: Bagheera <59658056+bghira@users.noreply.github.com> Co-authored-by: bghira <bghira@users.github.com> Co-authored-by: Hyoungwon Cho <jhw9811@korea.ac.kr> Co-authored-by: yiyixuxu <yixu310@gmail.com> Co-authored-by: Tolga Cangöz <46008593+standardAI@users.noreply.github.com> Co-authored-by: Philip Pham <phillypham@google.com>	2024-05-20 12:40:57 -04:00
Nikita	19df9f3ec0	Update pipeline_controlnet_inpaint_sd_xl.py (#7983 )	2024-05-20 12:24:49 -04:00
Jacob Marks	d6ca120987	Fix typo in "attention" (#7977 )	2024-05-20 11:54:29 -04:00
Sayak Paul	fb7ae0184f	[tests] fix Pixart Sigma tests (#7966 ) * checking tests * checking ii. * remove prints. * test_pixart_1024 * fix 1024.	2024-05-19 20:56:31 +05:30
Sayak Paul	70f8d4b488	remove unsafe workflow. (#7967 )	2024-05-17 13:46:24 +05:30
Álvaro Somoza	6c60e430ee	Consistent SDXL Controlnet callback tensor inputs (#7958 ) * make _callback_tensor_inputs consistent between sdxl pipelines * forgot this one * fix failing test * fix test_components_function * fix controlnet inpaint tests	2024-05-16 07:15:10 -10:00
Alphin Jain	1221b28eac	Fix AttributeError in train_lcm_distill_lora_sdxl_wds.py (#7923 ) Fix conditional teacher model check in train_lcm_distill_lora_sdxl_wds.py Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-05-16 15:49:54 +05:30
Liang Hou	746f603b20	Fix the text tokenizer name in logger warning of PixArt pipelines (#7912 ) Fix CLIP to T5 in logger warning	2024-05-15 18:49:29 -10:00
Sai-Suraj-27	2afea72d29	refactor: Refactored code by Merging `isinstance` calls (#7710 ) * Merged isinstance calls to make the code simpler. * Corrected formatting errors using ruff. --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> Co-authored-by: YiYi Xu <yixu310@gmail.com>	2024-05-15 18:33:19 -10:00
Sayak Paul	0f111ab794	[Workflows] add a workflow that can be manually triggered on a PR. (#7942 ) * add a workflow that can be manually triggered on a PR. * remove sudo * add command * small fixes.	2024-05-15 17:18:56 +05:30
Guillaume LEGENDRE	4dd7aaa06f	move to GH hosted M1 runner (#7949 )	2024-05-15 13:47:36 +05:30
Isamu Isozaki	d27e996ccd	Adding VQGAN Training script (#5483 ) * Init commit * Removed einops * Added default movq config for training * Update explanation of prompts * Fixed inheritance of discriminator and init_tracker * Fixed incompatible api between muse and here * Fixed output * Setup init training * Basic structure done * Removed attention for quick tests * Style fixes * Fixed vae/vqgan styles * Removed redefinition of wandb * Fixed log_validation and tqdm * Nothing commit * Added commit loss to lookup_from_codebook * Update src/diffusers/models/vq_model.py Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> * Adding perliminary README * Fixed one typo * Local changes * Fixed main issues * Merging * Update src/diffusers/models/vq_model.py Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> * Testing+Fixed bugs in training script * Some style fixes * Added wandb to docs * Fixed timm test * get testing suite ready. * remove return loss * remove return_loss * Remove diffs * Remove diffs * fix ruff format --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>	2024-05-15 08:47:12 +05:30
Sayak Paul	72780ff5b1	[tests] decorate StableDiffusion21PipelineSingleFileSlowTests with slow. (#7941 ) decorate StableDiffusion21PipelineSingleFileSlowTests with slow.	2024-05-14 14:26:21 -10:00
Jingyang Zhang	69fdb8720f	[Pipeline] Adding BoxDiff to community examples (#7947 ) add boxdiff to community examples	2024-05-14 11:18:29 -10:00
Nikita	b2140a895b	Fix `added_cond_kwargs` when using IP-Adapter in StableDiffusionXLControlNetInpaintPipeline (#7924 ) Fix `added_cond_kwargs` when using IP-Adapter Fix error when using IP-Adapter in pipeline and passing `ip_adapter_image_embeds` instead of `ip_adapter_image` Co-authored-by: YiYi Xu <yixu310@gmail.com>	2024-05-14 10:32:08 -10:00
Sayak Paul	e0e8c58f64	[Core] separate the loading utilities in modeling similar to pipelines. (#7943 ) separate the loading utilities in modeling similar to pipelines.	2024-05-14 22:33:43 +05:30
Sayak Paul	cbea5d1725	update to use hf-workflows for reporting the Docker build statuses (#7938 ) update to use hf-workflows for reporting	2024-05-14 09:25:13 +05:30
Tolga Cangöz	a1245c2c61	Expansion proposal of `diffusers-cli env` (#7403 ) * Expand `diffusers-cli env` * SafeTensors -> Safetensors Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> * Move `safetensors_version = "not installed"` to `else` * Update `safetensors_version` checking * Add GPU detection for Linux, Mac OS, and Windows * Add accelerator detection to environment command * Add is_peft_version to import_utils * Update env.py * Add `huggingface_hub` reference * Add `transformers` reference * Add reference for `huggingface_hub` * Fix print statement in env.py for unusual OS * Up * Fix platform information in env.py * up * Fix import order in env.py * ruff * make style * Fix platform system check in env.py * Fix run method return type in env.py * 🤗 * No need f-string * Remove location info * Remove accelerate config * Refactor env.py to remove accelerate config * feat: Add support for `bitsandbytes` library in environment command --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>	2024-05-14 08:20:24 +05:30
bssrdf	cdda94f412	fix VAE loading issue in train_dreambooth (#7632 ) * fixed vae loading issue #7619 * rerun make style && make quality * bring back model_has_vae and add change \ to / in config_file_name on windows os to make match work * add missing import platform * bring back import model_info * make config_file_name OS independent * switch to using Path.as_posix() to resolve OS dependence * improve style --------- Co-authored-by: Sayak Paul <spsayakpaul@gmail.com> Co-authored-by: bssrdf <bssrdf@gmail.com> Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>	2024-05-14 08:19:53 +05:30
dependabot[bot]	5b830aa356	Bump transformers from 4.36.0 to 4.38.0 in /examples/research_projects/realfill (#7635 ) Bump transformers in /examples/research_projects/realfill Bumps [transformers](https://github.com/huggingface/transformers) from 4.36.0 to 4.38.0. - [Release notes](https://github.com/huggingface/transformers/releases) - [Commits](https://github.com/huggingface/transformers/compare/v4.36.0...v4.38.0) --- updated-dependencies: - dependency-name: transformers dependency-type: direct:production ... Signed-off-by: dependabot[bot] <support@github.com> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>	2024-05-14 08:17:06 +05:30
Kohei	9e7bae9881	Update requirements.txt for text_to_image (#7892 ) Update requirements.txt If the datasets library is old, it will not read the metadata.jsonl and the label will default to an integer of type int. Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-05-14 08:09:12 +05:30
rebel-kblee	b41ce1e090	fix multicontrolnet `save_pretrained` logic for compatibility (#7821 ) fix multicontrolnet save_pretrained logic for compatibility Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-05-13 09:32:06 -10:00
Sayak Paul	95d3748453	[LoRA] Fix LoRA tests (side effects of RGB ordering) part ii (#7932 ) * check * check 2. * update slices	2024-05-13 09:23:48 -10:00
Fabio Rigano	44aa9e566d	fix AnimateDiff creation with a unet loaded with IP Adapter (#7791 ) * Fix loading from_pipe * Fix style --------- Co-authored-by: Dhruv Nair <dhruv.nair@gmail.com>	2024-05-13 08:15:01 -10:00
Álvaro Somoza	fdb05f54ef	Official callbacks (#7761 )	2024-05-12 17:10:29 -10:00
HelloWorldBeginner	98ba18ba55	Add Ascend NPU support for SDXL. (#7916 ) Co-authored-by: mhh001 <mahonghao1@huawei.com> Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-05-12 13:34:23 +02:00
Sayak Paul	5bb38586a9	[Core] fix offload behaviour when device_map is enabled. (#7919 ) fix offload behaviour when device_map is enabled.	2024-05-12 13:29:43 +02:00
Sai-Suraj-27	ec9e88139a	fix: Fixed a wrong link to supported python versions in `contributing.md` file (#7638 ) * Fixed a wrong link to python versions in contributing.md file. * Updated the link to a permalink, so that it will permanently point to the specific line.	2024-05-12 13:21:18 +02:00