update the description of StableDiffusionXLDenoiseLoopWrapper

add `to` method to modular loader, copied from DiffusionPipeline, not tested yet
add block mappings to modular_diffusers.stable_diffusion_xl.__init__
2025-06-20 07:38:21 +02:00 · 2025-06-20 07:25:20 +02:00 · 2025-06-20 07:24:14 +02:00 · 2025-06-20 07:23:14 +02:00 · 2025-06-19 05:30:18 +02:00 · 2025-06-19 04:45:20 +02:00
298 changed files with 15006 additions and 17667 deletions
@@ -142,7 +142,6 @@ jobs:
        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
-        RUN_COMPILE: yes
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "not Flax and not Onnx" \
@@ -526,60 +525,6 @@ jobs:
          pip install slack_sdk tabulate
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

-  run_nightly_pipeline_level_quantization_tests:
-    name: Torch quantization nightly tests
-    strategy:
-      fail-fast: false
-      max-parallel: 2
-    runs-on:
-      group: aws-g6e-xlarge-plus
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "20gb" --ipc host --gpus 0
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-      - name: NVIDIA-SMI
-        run: nvidia-smi
-      - name: Install dependencies
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-          python -m uv pip install -U bitsandbytes optimum_quanto
-          python -m uv pip install pytest-reportlog
-      - name: Environment
-        run: |
-          python utils/print_env.py
-      - name: Pipeline-level quantization tests on GPU
-        env:
-          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
-          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-          CUBLAS_WORKSPACE_CONFIG: :16:8
-          BIG_GPU_MEMORY: 40
-        run: |
-          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            --make-reports=tests_pipeline_level_quant_torch_cuda \
-            --report-log=tests_pipeline_level_quant_torch_cuda.log \
-            tests/quantization/test_pipeline_level_quantization.py
-      - name: Failure short reports
-        if: ${{ failure() }}
-        run: |
-          cat reports/tests_pipeline_level_quant_torch_cuda_stats.txt
-          cat reports/tests_pipeline_level_quant_torch_cuda_failures_short.txt
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: torch_cuda_pipeline_level_quant_reports
-          path: reports
-      - name: Generate Report and Notify Channel
-        if: always()
-        run: |
-          pip install slack_sdk tabulate
-          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
-  
 # M1 runner currently not well supported
 # TODO: (Dhruv) add these back when we setup better testing for Apple Silicon
 #  run_nightly_tests_apple_m1:
@@ -11,7 +11,6 @@ on:
      - "tests/**.py"
      - ".github/**.yml"
      - "utils/**.py"
-      - "setup.py"
  push:
    branches:
      - ci-*
@@ -17,8 +17,12 @@
    title: AutoPipeline
  - local: tutorials/basic_training
    title: Train a diffusion model
+  - local: tutorials/using_peft_for_inference
+    title: Load LoRAs for inference
  - local: tutorials/fast_diffusion
    title: Accelerate inference of text-to-image diffusion models
+  - local: tutorials/inference_with_big_models
+    title: Working with big models
  title: Tutorials
 - sections:
  - local: using-diffusers/loading
@@ -29,24 +33,11 @@
    title: Load schedulers and models
  - local: using-diffusers/other-formats
    title: Model files and layouts
+  - local: using-diffusers/loading_adapters
+    title: Load adapters
  - local: using-diffusers/push_to_hub
    title: Push files to the Hub
  title: Load pipelines and adapters
- sections:
-  - local: tutorials/using_peft_for_inference
-    title: LoRA
-  - local: using-diffusers/ip_adapter
-    title: IP-Adapter
-  - local: using-diffusers/controlnet
-    title: ControlNet
-  - local: using-diffusers/t2i_adapter
-    title: T2I-Adapter
-  - local: using-diffusers/dreambooth
-    title: DreamBooth
-  - local: using-diffusers/textual_inversion_inference
-    title: Textual inversion
-  title: Adapters
-  isExpanded: false
 - sections:
  - local: using-diffusers/unconditional_image_generation
    title: Unconditional image generation
@@ -68,6 +59,8 @@
    title: Create a server
  - local: training/distributed_inference
    title: Distributed inference
+  - local: using-diffusers/merge_loras
+    title: Merge LoRAs
  - local: using-diffusers/scheduler_features
    title: Scheduler features
  - local: using-diffusers/callback
@@ -104,12 +97,20 @@
    title: SDXL Turbo
  - local: using-diffusers/kandinsky
    title: Kandinsky
+  - local: using-diffusers/ip_adapter
+    title: IP-Adapter
  - local: using-diffusers/omnigen
    title: OmniGen
  - local: using-diffusers/pag
    title: PAG
+  - local: using-diffusers/controlnet
+    title: ControlNet
+  - local: using-diffusers/t2i_adapter
+    title: T2I-Adapter
  - local: using-diffusers/inference_with_lcm
    title: Latent Consistency Model
+  - local: using-diffusers/textual_inversion_inference
+    title: Textual inversion
  - local: using-diffusers/shap-e
    title: Shap-E
  - local: using-diffusers/diffedit
@@ -179,7 +180,7 @@
  title: Quantization Methods
 - sections:
  - local: optimization/fp16
-    title: Accelerate inference
+    title: Speed up inference
  - local: optimization/memory
    title: Reduce memory usage
  - local: optimization/torch2.0
@@ -295,8 +296,6 @@
        title: CogView4Transformer2DModel
      - local: api/models/consisid_transformer3d
        title: ConsisIDTransformer3DModel
-      - local: api/models/cosmos_transformer3d
-        title: CosmosTransformer3DModel
      - local: api/models/dit_transformer2d
        title: DiTTransformer2DModel
      - local: api/models/easyanimate_transformer3d
@@ -365,8 +364,6 @@
        title: AutoencoderKLAllegro
      - local: api/models/autoencoderkl_cogvideox
        title: AutoencoderKLCogVideoX
-      - local: api/models/autoencoderkl_cosmos
-        title: AutoencoderKLCosmos
      - local: api/models/autoencoder_kl_hunyuan_video
        title: AutoencoderKLHunyuanVideo
      - local: api/models/autoencoderkl_ltx_video
@@ -437,8 +434,6 @@
      title: ControlNet-XS with Stable Diffusion XL
    - local: api/pipelines/controlnet_union
      title: ControlNetUnion
-    - local: api/pipelines/cosmos
-      title: Cosmos
    - local: api/pipelines/dance_diffusion
      title: Dance Diffusion
    - local: api/pipelines/ddim
@@ -457,8 +452,6 @@
      title: Flux
    - local: api/pipelines/control_flux_inpaint
      title: FluxControlInpaint
-    - local: api/pipelines/framepack
-      title: Framepack
    - local: api/pipelines/hidream
      title: HiDream-I1
    - local: api/pipelines/hunyuandit
@@ -575,8 +568,6 @@
      title: UniDiffuser
    - local: api/pipelines/value_guided_sampling
      title: Value-guided sampling
-    - local: api/pipelines/visualcloze
-      title: VisualCloze
    - local: api/pipelines/wan
      title: Wan
    - local: api/pipelines/wuerstchen
@@ -1,40 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# AutoencoderKLCosmos
-
-[Cosmos Tokenizers](https://github.com/NVIDIA/Cosmos-Tokenizer).
-
-Supported models:
- [nvidia/Cosmos-1.0-Tokenizer-CV8x8x8](https://huggingface.co/nvidia/Cosmos-1.0-Tokenizer-CV8x8x8)
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import AutoencoderKLCosmos
-
-vae = AutoencoderKLCosmos.from_pretrained("nvidia/Cosmos-1.0-Tokenizer-CV8x8x8", subfolder="vae")
-```
-
-## AutoencoderKLCosmos
-
-[[autodoc]] AutoencoderKLCosmos
-    - decode
-    - encode
-    - all
-
-## AutoencoderKLOutput
-
-[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
-
-## DecoderOutput
-
-[[autodoc]] models.autoencoders.vae.DecoderOutput
@@ -1,30 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# CosmosTransformer3DModel
-
-A Diffusion Transformer model for 3D video-like data was introduced in [Cosmos World Foundation Model Platform for Physical AI](https://huggingface.co/papers/2501.03575) by NVIDIA.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import CosmosTransformer3DModel
-
-transformer = CosmosTransformer3DModel.from_pretrained("nvidia/Cosmos-1.0-Diffusion-7B-Text2World", subfolder="transformer", torch_dtype=torch.bfloat16)
-```
-
-## CosmosTransformer3DModel
-
-[[autodoc]] CosmosTransformer3DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
@@ -21,22 +21,6 @@ from diffusers import HiDreamImageTransformer2DModel
 transformer = HiDreamImageTransformer2DModel.from_pretrained("HiDream-ai/HiDream-I1-Full", subfolder="transformer", torch_dtype=torch.bfloat16)
 ```

-## Loading GGUF quantized checkpoints for HiDream-I1
-
-GGUF checkpoints for the `HiDreamImageTransformer2DModel` can be loaded using `~FromOriginalModelMixin.from_single_file`.
-
-```python
-import torch
-from diffusers import GGUFQuantizationConfig, HiDreamImageTransformer2DModel
-
-ckpt_path = "https://huggingface.co/city96/HiDream-I1-Dev-gguf/blob/main/hidream-i1-dev-Q2_K.gguf"
-transformer = HiDreamImageTransformer2DModel.from_single_file(
-    ckpt_path,
-    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
-    torch_dtype=torch.bfloat16
-)
-```
-
 ## HiDreamImageTransformer2DModel

 [[autodoc]] HiDreamImageTransformer2DModel
@@ -966,7 +966,7 @@ pipe.to("cuda")
 prompt = {
    0: "A caterpillar on a leaf, high quality, photorealistic",
    40: "A caterpillar transforming into a cocoon, on a leaf, near flowers, photorealistic",
-    80: "A cocoon on a leaf, flowers in the background, photorealistic",
+    80: "A cocoon on a leaf, flowers in the background, photorealistic",
    120: "A cocoon maturing and a butterfly being born, flowers and leaves visible in the background, photorealistic",
    160: "A beautiful butterfly, vibrant colors, sitting on a leaf, flowers in the background, photorealistic",
    200: "A beautiful butterfly, flying away in a forest, photorealistic",
@@ -1,41 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. -->
-
-# Cosmos
-
-[Cosmos World Foundation Model Platform for Physical AI](https://huggingface.co/papers/2501.03575) by NVIDIA.
-
-*Physical AI needs to be trained digitally first. It needs a digital twin of itself, the policy model, and a digital twin of the world, the world model. In this paper, we present the Cosmos World Foundation Model Platform to help developers build customized world models for their Physical AI setups. We position a world foundation model as a general-purpose world model that can be fine-tuned into customized world models for downstream applications. Our platform covers a video curation pipeline, pre-trained world foundation models, examples of post-training of pre-trained world foundation models, and video tokenizers. To help Physical AI builders solve the most critical problems of our society, we make our platform open-source and our models open-weight with permissive licenses available via https://github.com/NVIDIA/Cosmos.*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## CosmosTextToWorldPipeline
-
-[[autodoc]] CosmosTextToWorldPipeline
-  - all
-  - __call__
-
-## CosmosVideoToWorldPipeline
-
-[[autodoc]] CosmosVideoToWorldPipeline
-  - all
-  - __call__
-
-## CosmosPipelineOutput
-
-[[autodoc]] pipelines.cosmos.pipeline_output.CosmosPipelineOutput
@@ -1,209 +0,0 @@
-<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. -->
-
-# Framepack
-
-<div class="flex flex-wrap space-x-1">
-  <img alt="LoRA" src="https://img.shields.io/badge/LoRA-d8b4fe?style=flat"/>
-</div>
-
-[Packing Input Frame Context in Next-Frame Prediction Models for Video Generation](https://arxiv.org/abs/2504.12626) by Lvmin Zhang and Maneesh Agrawala.
-
-*We present a neural network structure, FramePack, to train next-frame (or next-frame-section) prediction models for video generation. The FramePack compresses input frames to make the transformer context length a fixed number regardless of the video length. As a result, we are able to process a large number of frames using video diffusion with computation bottleneck similar to image diffusion. This also makes the training video batch sizes significantly higher (batch sizes become comparable to image diffusion training). We also propose an anti-drifting sampling method that generates frames in inverted temporal order with early-established endpoints to avoid exposure bias (error accumulation over iterations). Finally, we show that existing video diffusion models can be finetuned with FramePack, and their visual quality may be improved because the next-frame prediction supports more balanced diffusion schedulers with less extreme flow shift timesteps.*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## Available models
-
-| Model name | Description |
-|:---|:---|
-| [`lllyasviel/FramePackI2V_HY`](https://huggingface.co/lllyasviel/FramePackI2V_HY) | Trained with the "inverted anti-drifting" strategy as described in the paper. Inference requires setting `sampling_type="inverted_anti_drifting"` when running the pipeline. |
-| [`lllyasviel/FramePack_F1_I2V_HY_20250503`](https://huggingface.co/lllyasviel/FramePack_F1_I2V_HY_20250503) | Trained with a novel anti-drifting strategy but inference is performed in "vanilla" strategy as described in the paper. Inference requires setting `sampling_type="vanilla"` when running the pipeline. |
-
-## Usage
-
-Refer to the pipeline documentation for basic usage examples. The following section contains examples of offloading, different sampling methods, quantization, and more.
-
-### First and last frame to video
-
-The following example shows how to use Framepack with start and end image controls, using the inverted anti-drifting sampling model.
-
-```python
-import torch
-from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
-from diffusers.utils import export_to_video, load_image
-from transformers import SiglipImageProcessor, SiglipVisionModel
-
-transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
-    "lllyasviel/FramePackI2V_HY", torch_dtype=torch.bfloat16
-)
-feature_extractor = SiglipImageProcessor.from_pretrained(
-    "lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
-)
-image_encoder = SiglipVisionModel.from_pretrained(
-    "lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
-)
-pipe = HunyuanVideoFramepackPipeline.from_pretrained(
-    "hunyuanvideo-community/HunyuanVideo",
-    transformer=transformer,
-    feature_extractor=feature_extractor,
-    image_encoder=image_encoder,
-    torch_dtype=torch.float16,
-)
-
-# Enable memory optimizations
-pipe.enable_model_cpu_offload()
-pipe.vae.enable_tiling()
-
-prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
-first_image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png"
-)
-last_image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png"
-)
-output = pipe(
-    image=first_image,
-    last_image=last_image,
-    prompt=prompt,
-    height=512,
-    width=512,
-    num_frames=91,
-    num_inference_steps=30,
-    guidance_scale=9.0,
-    generator=torch.Generator().manual_seed(0),
-    sampling_type="inverted_anti_drifting",
-).frames[0]
-export_to_video(output, "output.mp4", fps=30)
-```
-
-### Vanilla sampling
-
-The following example shows how to use Framepack with the F1 model trained with vanilla sampling but new regulation approach for anti-drifting.
-
-```python
-import torch
-from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
-from diffusers.utils import export_to_video, load_image
-from transformers import SiglipImageProcessor, SiglipVisionModel
-
-transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
-    "lllyasviel/FramePack_F1_I2V_HY_20250503", torch_dtype=torch.bfloat16
-)
-feature_extractor = SiglipImageProcessor.from_pretrained(
-    "lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
-)
-image_encoder = SiglipVisionModel.from_pretrained(
-    "lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
-)
-pipe = HunyuanVideoFramepackPipeline.from_pretrained(
-    "hunyuanvideo-community/HunyuanVideo",
-    transformer=transformer,
-    feature_extractor=feature_extractor,
-    image_encoder=image_encoder,
-    torch_dtype=torch.float16,
-)
-
-# Enable memory optimizations
-pipe.enable_model_cpu_offload()
-pipe.vae.enable_tiling()
-
-image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png"
-)
-output = pipe(
-    image=image,
-    prompt="A penguin dancing in the snow",
-    height=832,
-    width=480,
-    num_frames=91,
-    num_inference_steps=30,
-    guidance_scale=9.0,
-    generator=torch.Generator().manual_seed(0),
-    sampling_type="vanilla",
-).frames[0]
-export_to_video(output, "output.mp4", fps=30)
-```
-
-### Group offloading
-
-Group offloading ([`~hooks.apply_group_offloading`]) provides aggressive memory optimizations for offloading internal parts of any model to the CPU, with possibly no additional overhead to generation time. If you have very low VRAM available, this approach may be suitable for you depending on the amount of CPU RAM available.
-
-```python
-import torch
-from diffusers import HunyuanVideoFramepackPipeline, HunyuanVideoFramepackTransformer3DModel
-from diffusers.hooks import apply_group_offloading
-from diffusers.utils import export_to_video, load_image
-from transformers import SiglipImageProcessor, SiglipVisionModel
-
-transformer = HunyuanVideoFramepackTransformer3DModel.from_pretrained(
-    "lllyasviel/FramePack_F1_I2V_HY_20250503", torch_dtype=torch.bfloat16
-)
-feature_extractor = SiglipImageProcessor.from_pretrained(
-    "lllyasviel/flux_redux_bfl", subfolder="feature_extractor"
-)
-image_encoder = SiglipVisionModel.from_pretrained(
-    "lllyasviel/flux_redux_bfl", subfolder="image_encoder", torch_dtype=torch.float16
-)
-pipe = HunyuanVideoFramepackPipeline.from_pretrained(
-    "hunyuanvideo-community/HunyuanVideo",
-    transformer=transformer,
-    feature_extractor=feature_extractor,
-    image_encoder=image_encoder,
-    torch_dtype=torch.float16,
-)
-
-# Enable group offloading
-onload_device = torch.device("cuda")
-offload_device = torch.device("cpu")
-list(map(
-    lambda x: apply_group_offloading(x, onload_device, offload_device, offload_type="leaf_level", use_stream=True, low_cpu_mem_usage=True),
-    [pipe.text_encoder, pipe.text_encoder_2, pipe.transformer]
-))
-pipe.image_encoder.to(onload_device)
-pipe.vae.to(onload_device)
-pipe.vae.enable_tiling()
-
-image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/penguin.png"
-)
-output = pipe(
-    image=image,
-    prompt="A penguin dancing in the snow",
-    height=832,
-    width=480,
-    num_frames=91,
-    num_inference_steps=30,
-    guidance_scale=9.0,
-    generator=torch.Generator().manual_seed(0),
-    sampling_type="vanilla",
-).frames[0]
-print(f"Max memory: {torch.cuda.max_memory_allocated() / 1024**3:.3f} GB")
-export_to_video(output, "output.mp4", fps=30)
-```
-
-## HunyuanVideoFramepackPipeline
-
-[[autodoc]] HunyuanVideoFramepackPipeline
-  - all
-  - __call__
-
-## HunyuanVideoPipelineOutput
-
-[[autodoc]] pipelines.hunyuan_video.pipeline_output.HunyuanVideoPipelineOutput
-
@@ -29,7 +29,7 @@ You can find additional information about LEDITS++ on the [project page](https:/
 </Tip>

 <Tip warning={true}>
-Due to some backward compatibility issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion.
+Due to some backward compatibility issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion.
 This issue is unlikely to have any noticeable effects on applied use-cases. However, we provide an alternative implementation that guarantees perfect inversion in a dedicated [GitHub repo](https://github.com/ml-research/ledits_pp).
 </Tip>

@@ -31,103 +31,12 @@ Available models:

 |  Model name   | Recommended dtype |
 |:-------------:|:-----------------:|
-| [`LTX Video 2B 0.9.0`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.safetensors) | `torch.bfloat16` |
-| [`LTX Video 2B 0.9.1`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.1.safetensors) | `torch.bfloat16` |
-| [`LTX Video 2B 0.9.5`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.5.safetensors) | `torch.bfloat16` |
-| [`LTX Video 13B 0.9.7`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltxv-13b-0.9.7-dev.safetensors) | `torch.bfloat16` |
-| [`LTX Video Spatial Upscaler 0.9.7`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltxv-spatial-upscaler-0.9.7.safetensors) | `torch.bfloat16` |
+| [`LTX Video 0.9.0`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.safetensors) | `torch.bfloat16` |
+| [`LTX Video 0.9.1`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.1.safetensors) | `torch.bfloat16` |
+| [`LTX Video 0.9.5`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.5.safetensors) | `torch.bfloat16` |

 Note: The recommended dtype is for the transformer component. The VAE and text encoders can be either `torch.float32`, `torch.bfloat16` or `torch.float16` but the recommended dtype is `torch.bfloat16` as used in the original repository.

-## Recommended settings for generation
-
-For the best results, it is recommended to follow the guidelines mentioned in the official LTX Video [repository](https://github.com/Lightricks/LTX-Video).
-
- Some variants of LTX Video are guidance-distilled. For guidance-distilled models, `guidance_scale` must be set to `1.0`. For any other models, `guidance_scale` should be set higher (e.g., `5.0`) for good generation quality.
- For variants with a timestep-aware VAE (LTXV 0.9.1 and above), it is recommended to set `decode_timestep` to `0.05` and `image_cond_noise_scale` to `0.025`.
- For variants that support interpolation between multiple conditioning images and videos (LTXV 0.9.5 and above), it is recommended to use similar looking images/videos for the best results. High divergence between the conditionings may lead to abrupt transitions in the generated video.
-
-## Using LTX Video 13B 0.9.7
-
-LTX Video 0.9.7 comes with a spatial latent upscaler and a 13B parameter transformer. The inference involves generating a low resolution video first, which is very fast, followed by upscaling and refining the generated video.
-
-<!-- TODO(aryan): modify when official checkpoints are available -->
-
-```python
-import torch
-from diffusers import LTXConditionPipeline, LTXLatentUpsamplePipeline
-from diffusers.pipelines.ltx.pipeline_ltx_condition import LTXVideoCondition
-from diffusers.utils import export_to_video, load_video
-
-pipe = LTXConditionPipeline.from_pretrained("a-r-r-o-w/LTX-Video-0.9.7-diffusers", torch_dtype=torch.bfloat16)
-pipe_upsample = LTXLatentUpsamplePipeline.from_pretrained("a-r-r-o-w/LTX-Video-0.9.7-Latent-Spatial-Upsampler-diffusers", vae=pipe.vae, torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-pipe_upsample.to("cuda")
-pipe.vae.enable_tiling()
-
-def round_to_nearest_resolution_acceptable_by_vae(height, width):
-    height = height - (height % pipe.vae_temporal_compression_ratio)
-    width = width - (width % pipe.vae_temporal_compression_ratio)
-    return height, width
-
-video = load_video(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
-)[:21]  # Use only the first 21 frames as conditioning
-condition1 = LTXVideoCondition(video=video, frame_index=0)
-
-prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
-negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
-expected_height, expected_width = 768, 1152
-downscale_factor = 2 / 3
-num_frames = 161
-
-# Part 1. Generate video at smaller resolution
-# Text-only conditioning is also supported without the need to pass `conditions`
-downscaled_height, downscaled_width = int(expected_height * downscale_factor), int(expected_width * downscale_factor)
-downscaled_height, downscaled_width = round_to_nearest_resolution_acceptable_by_vae(downscaled_height, downscaled_width)
-latents = pipe(
-    conditions=[condition1],
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    width=downscaled_width,
-    height=downscaled_height,
-    num_frames=num_frames,
-    num_inference_steps=30,
-    generator=torch.Generator().manual_seed(0),
-    output_type="latent",
-).frames
-
-# Part 2. Upscale generated video using latent upsampler with fewer inference steps
-# The available latent upsampler upscales the height/width by 2x
-upscaled_height, upscaled_width = downscaled_height * 2, downscaled_width * 2
-upscaled_latents = pipe_upsample(
-    latents=latents,
-    output_type="latent"
-).frames
-
-# Part 3. Denoise the upscaled video with few steps to improve texture (optional, but recommended)
-video = pipe(
-    conditions=[condition1],
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    width=upscaled_width,
-    height=upscaled_height,
-    num_frames=num_frames,
-    denoise_strength=0.4,  # Effectively, 4 inference steps out of 10
-    num_inference_steps=10,
-    latents=upscaled_latents,
-    decode_timestep=0.05,
-    image_cond_noise_scale=0.025,
-    generator=torch.Generator().manual_seed(0),
-    output_type="pil",
-).frames[0]
-
-# Part 4. Downscale the video to the expected resolution
-video = [frame.resize((expected_width, expected_height)) for frame in video]
-
-export_to_video(video, "output.mp4", fps=24)
-```
-
 ## Loading Single Files

 Loading the original LTX Video checkpoints is also possible with [`~ModelMixin.from_single_file`]. We recommend using `from_single_file` for the Lightricks series of models, as they plan to release multiple models in the future in the single file format.
@@ -295,12 +204,6 @@ export_to_video(video, "ship.mp4", fps=24)
  - all
  - __call__

-## LTXLatentUpsamplePipeline
-
-[[autodoc]] LTXLatentUpsamplePipeline
-  - all
-  - __call__
-
 ## LTXPipelineOutput

 [[autodoc]] pipelines.ltx.pipeline_output.LTXPipelineOutput
@@ -89,7 +89,6 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
 | [UniDiffuser](unidiffuser) | text2image, image2text, image variation, text variation, unconditional image generation, unconditional audio generation |
 | [Value-guided planning](value_guided_sampling) | value guided sampling |
 | [Wuerstchen](wuerstchen) | text2image |
-| [VisualCloze](visualcloze) | text2image, image2image, subject driven generation, inpainting, style transfer, image restoration, image editing, [depth,normal,edge,pose]2image, [depth,normal,edge,pose]-estimation, virtual try-on, image relighting |

 ## DiffusionPipeline

@@ -1,300 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-->
-
-# VisualCloze
-
-[VisualCloze: A Universal Image Generation Framework via Visual In-Context Learning](https://arxiv.org/abs/2504.07960) is an innovative in-context learning based universal image generation framework that offers key capabilities:
-1. Support for various in-domain tasks
-2. Generalization to unseen tasks through in-context learning
-3. Unify multiple tasks into one step and generate both target image and intermediate results
-4. Support reverse-engineering conditions from target images
-
-## Overview
-
-The abstract from the paper is:
-
-*Recent progress in diffusion models significantly advances various image generation tasks. However, the current mainstream approach remains focused on building task-specific models, which have limited efficiency when supporting a wide range of different needs. While universal models attempt to address this limitation, they face critical challenges, including generalizable task instruction, appropriate task distributions, and unified architectural design. To tackle these challenges, we propose VisualCloze, a universal image generation framework, which supports a wide range of in-domain tasks, generalization to unseen ones, unseen unification of multiple tasks, and reverse generation. Unlike existing methods that rely on language-based task instruction, leading to task ambiguity and weak generalization, we integrate visual in-context learning, allowing models to identify tasks from visual demonstrations. Meanwhile, the inherent sparsity of visual task distributions hampers the learning of transferable knowledge across tasks. To this end, we introduce Graph200K, a graph-structured dataset that establishes various interrelated tasks, enhancing task density and transferable knowledge. Furthermore, we uncover that our unified image generation formulation shared a consistent objective with image infilling, enabling us to leverage the strong generative priors of pre-trained infilling models without modifying the architectures. The codes, dataset, and models are available at https://visualcloze.github.io.*
-
-## Inference
-
-### Model loading
-
-VisualCloze is a two-stage cascade pipeline, containing `VisualClozeGenerationPipeline` and `VisualClozeUpsamplingPipeline`.
- In `VisualClozeGenerationPipeline`, each image is downsampled before concatenating images into a grid layout, avoiding excessively high resolutions. VisualCloze releases two models suitable for diffusers, i.e., [VisualClozePipeline-384](https://huggingface.co/VisualCloze/VisualClozePipeline-384) and [VisualClozePipeline-512](https://huggingface.co/VisualCloze/VisualClozePipeline-512), which downsample images to resolutions of 384 and 512, respectively. 
- `VisualClozeUpsamplingPipeline` uses [SDEdit](https://arxiv.org/abs/2108.01073) to enable high-resolution image synthesis.
-
-The `VisualClozePipeline` integrates both stages to support convenient end-to-end sampling, while also allowing users to utilize each pipeline independently as needed.
-
-### Input Specifications
-
-#### Task and Content Prompts
- Task prompt: Required to describe the generation task intention
- Content prompt: Optional description or caption of the target image
- When content prompt is not needed, pass `None`
- For batch inference, pass `List[str|None]`
-
-#### Image Input Format
- Format: `List[List[Image|None]]`
- Structure:
-  - All rows except the last represent in-context examples
-  - Last row represents the current query (target image set to `None`)
- For batch inference, pass `List[List[List[Image|None]]]`
-
-#### Resolution Control
- Default behavior:
-  - Initial generation in the first stage: area of ${pipe.resolution}^2$
-  - Upsampling in the second stage: 3x factor
- Custom resolution: Adjust using `upsampling_height` and `upsampling_width` parameters
-
-### Examples
-
-For comprehensive examples covering a wide range of tasks, please refer to the [Online Demo](https://huggingface.co/spaces/VisualCloze/VisualCloze) and [GitHub Repository](https://github.com/lzyhha/VisualCloze). Below are simple examples for three cases: mask-to-image conversion, edge detection, and subject-driven generation.
-
-#### Example for mask2image
-
-```python
-import torch
-from diffusers import VisualClozePipeline
-from diffusers.utils import load_image
-
-pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-
-# Load in-context images (make sure the paths are correct and accessible)
-image_paths = [
-    # in-context examples
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_mask2image_incontext-example-1_mask.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_mask2image_incontext-example-1_image.jpg'),
-    ],
-    # query with the target image
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_mask2image_query_mask.jpg'),
-        None, # No image needed for the target image
-    ],
-]
-
-# Task and content prompt
-task_prompt = "In each row, a logical task is demonstrated to achieve [IMAGE2] an aesthetically pleasing photograph based on [IMAGE1] sam 2-generated masks with rich color coding."
-content_prompt = """Majestic photo of a golden eagle perched on a rocky outcrop in a mountainous landscape. 
-The eagle is positioned in the right foreground, facing left, with its sharp beak and keen eyes prominently visible. 
-Its plumage is a mix of dark brown and golden hues, with intricate feather details. 
-The background features a soft-focus view of snow-capped mountains under a cloudy sky, creating a serene and grandiose atmosphere. 
-The foreground includes rugged rocks and patches of green moss. Photorealistic, medium depth of field, 
-soft natural lighting, cool color palette, high contrast, sharp focus on the eagle, blurred background, 
-tranquil, majestic, wildlife photography."""
-
-# Run the pipeline
-image_result = pipe(
-    task_prompt=task_prompt,
-    content_prompt=content_prompt,
-    image=image_paths,
-    upsampling_width=1344,
-    upsampling_height=768,
-    upsampling_strength=0.4,
-    guidance_scale=30,
-    num_inference_steps=30,
-    max_sequence_length=512,
-    generator=torch.Generator("cpu").manual_seed(0)
-).images[0][0]
-
-# Save the resulting image
-image_result.save("visualcloze.png")
-```
-
-#### Example for edge-detection
-
-```python
-import torch
-from diffusers import VisualClozePipeline
-from diffusers.utils import load_image
-
-pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-
-# Load in-context images (make sure the paths are correct and accessible)
-image_paths = [
-    # in-context examples
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_edgedetection_incontext-example-1_image.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_edgedetection_incontext-example-1_edge.jpg'),
-    ],
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_edgedetection_incontext-example-2_image.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_edgedetection_incontext-example-2_edge.jpg'),
-    ],
-    # query with the target image
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_edgedetection_query_image.jpg'),
-        None, # No image needed for the target image
-    ],
-]
-
-# Task and content prompt
-task_prompt = "Each row illustrates a pathway from [IMAGE1] a sharp and beautifully composed photograph to [IMAGE2] edge map with natural well-connected outlines using a clear logical task."
-content_prompt = ""
-
-# Run the pipeline
-image_result = pipe(
-    task_prompt=task_prompt,
-    content_prompt=content_prompt,
-    image=image_paths,
-    upsampling_width=864,
-    upsampling_height=1152,
-    upsampling_strength=0.4,
-    guidance_scale=30,
-    num_inference_steps=30,
-    max_sequence_length=512,
-    generator=torch.Generator("cpu").manual_seed(0)
-).images[0][0]
-
-# Save the resulting image
-image_result.save("visualcloze.png")
-```
-
-#### Example for subject-driven generation
-
-```python
-import torch
-from diffusers import VisualClozePipeline
-from diffusers.utils import load_image
-
-pipe = VisualClozePipeline.from_pretrained("VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-
-# Load in-context images (make sure the paths are correct and accessible)
-image_paths = [
-    # in-context examples
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_incontext-example-1_reference.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_incontext-example-1_depth.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_incontext-example-1_image.jpg'),
-    ],
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_incontext-example-2_reference.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_incontext-example-2_depth.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_incontext-example-2_image.jpg'),
-    ],
-    # query with the target image
-    [
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_query_reference.jpg'),
-        load_image('https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_subjectdriven_query_depth.jpg'),
-        None, # No image needed for the target image
-    ],
-]
-
-# Task and content prompt
-task_prompt = """Each row describes a process that begins with [IMAGE1] an image containing the key object, 
-[IMAGE2] depth map revealing gray-toned spatial layers and results in 
-[IMAGE3] an image with artistic qualitya high-quality image with exceptional detail."""
-content_prompt = """A vintage porcelain collector's item. Beneath a blossoming cherry tree in early spring, 
-this treasure is photographed up close, with soft pink petals drifting through the air and vibrant blossoms framing the scene."""
-
-# Run the pipeline
-image_result = pipe(
-    task_prompt=task_prompt,
-    content_prompt=content_prompt,
-    image=image_paths,
-    upsampling_width=1024,
-    upsampling_height=1024,
-    upsampling_strength=0.2,
-    guidance_scale=30,
-    num_inference_steps=30,
-    max_sequence_length=512,
-    generator=torch.Generator("cpu").manual_seed(0)
-).images[0][0]
-
-# Save the resulting image
-image_result.save("visualcloze.png")
-```
-
-#### Utilize each pipeline independently 
-
-```python
-import torch
-from diffusers import VisualClozeGenerationPipeline, FluxFillPipeline as VisualClozeUpsamplingPipeline
-from diffusers.utils import load_image
-from PIL import Image
-
-pipe = VisualClozeGenerationPipeline.from_pretrained(
-    "VisualCloze/VisualClozePipeline-384", resolution=384, torch_dtype=torch.bfloat16
-)
-pipe.to("cuda")
-
-image_paths = [
-    # in-context examples
-    [
-        load_image(
-            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_mask2image_incontext-example-1_mask.jpg"
-        ),
-        load_image(
-            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_mask2image_incontext-example-1_image.jpg"
-        ),
-    ],
-    # query with the target image
-    [
-        load_image(
-            "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/visualcloze/visualcloze_mask2image_query_mask.jpg"
-        ),
-        None,  # No image needed for the target image
-    ],
-]
-task_prompt = "In each row, a logical task is demonstrated to achieve [IMAGE2] an aesthetically pleasing photograph based on [IMAGE1] sam 2-generated masks with rich color coding."
-content_prompt = "Majestic photo of a golden eagle perched on a rocky outcrop in a mountainous landscape. The eagle is positioned in the right foreground, facing left, with its sharp beak and keen eyes prominently visible. Its plumage is a mix of dark brown and golden hues, with intricate feather details. The background features a soft-focus view of snow-capped mountains under a cloudy sky, creating a serene and grandiose atmosphere. The foreground includes rugged rocks and patches of green moss. Photorealistic, medium depth of field, soft natural lighting, cool color palette, high contrast, sharp focus on the eagle, blurred background, tranquil, majestic, wildlife photography."
-
-# Stage 1: Generate initial image
-image = pipe(
-    task_prompt=task_prompt,
-    content_prompt=content_prompt,
-    image=image_paths,
-    guidance_scale=30,
-    num_inference_steps=30,
-    max_sequence_length=512,
-    generator=torch.Generator("cpu").manual_seed(0),
-).images[0][0]
-
-# Stage 2 (optional): Upsample the generated image
-pipe_upsample = VisualClozeUpsamplingPipeline.from_pipe(pipe)
-pipe_upsample.to("cuda")
-
-mask_image = Image.new("RGB", image.size, (255, 255, 255))
-
-image = pipe_upsample(
-    image=image,
-    mask_image=mask_image,
-    prompt=content_prompt,
-    width=1344,
-    height=768,
-    strength=0.4,
-    guidance_scale=30,
-    num_inference_steps=30,
-    max_sequence_length=512,
-    generator=torch.Generator("cpu").manual_seed(0),
-).images[0]
-
-image.save("visualcloze.png")
-```
-
-## VisualClozePipeline
-
-[[autodoc]] VisualClozePipeline
-  - all
-  - __call__
-
-## VisualClozeGenerationPipeline
-
-[[autodoc]] VisualClozeGenerationPipeline
-  - all
-  - __call__
@@ -285,7 +285,7 @@ pipe = WanImageToVideoPipeline.from_pretrained(
    image_encoder=image_encoder,
    torch_dtype=torch.bfloat16
 )
-# Since we've offloaded the larger models already, we can move the rest of the model components to GPU
+# Since we've offloaded the larger models already, we can move the rest of the model components to GPU
 pipe.to("cuda")

 image = load_image(
@@ -368,7 +368,7 @@ pipe = WanImageToVideoPipeline.from_pretrained(
    image_encoder=image_encoder,
    torch_dtype=torch.bfloat16
 )
-# Since we've offloaded the larger models already, we can move the rest of the model components to GPU
+# Since we've offloaded the larger models already, we can move the rest of the model components to GPU
 pipe.to("cuda")

 image = load_image(
@@ -13,7 +13,9 @@ specific language governing permissions and limitations under the License.

 # Quantization

-Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference.
+Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference. Diffusers supports 8-bit and 4-bit quantization with [bitsandbytes](https://huggingface.co/docs/bitsandbytes/en/index).
+
+Quantization techniques that aren't supported in Transformers can be added with the [`DiffusersQuantizer`] class.

 <Tip>

@@ -21,9 +23,6 @@ Learn how to quantize models in the [Quantization](../quantization/overview) gui

 </Tip>

-## PipelineQuantizationConfig
-
-[[autodoc]] quantizers.PipelineQuantizationConfig

 ## BitsAndBytesConfig

@@ -10,211 +10,120 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Accelerate inference
+# Speed up inference

-Diffusion models are slow at inference because generation is an iterative process where noise is gradually refined into an image or video over a certain number of "steps". To speed up this process, you can try experimenting with different [schedulers](../api/schedulers/overview), reduce the precision of the model weights for faster computations, use more memory-efficient attention mechanisms, and more.
+There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times.

-Combine and use these techniques together to make inference faster than using any single technique on its own.
+> [!TIP]
+> Optimizing for inference speed or reduced memory usage can lead to improved performance in the other category, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about lowering memory usage in the [Reduce memory usage](memory) guide.

-This guide will go over how to accelerate inference.
+The inference times below are obtained from generating a single 512x512 image from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps on an NVIDIA A100.

-## Model data type
+| setup    | latency | speed-up |
+|----------|---------|----------|
+| baseline | 5.27s   | x1       |
+| tf32     | 4.14s   | x1.27    |
+| fp16     | 3.51s   | x1.50    |
+| combined | 3.41s   | x1.54    |

-The precision and data type of the model weights affect inference speed because a higher precision requires more memory to load and more time to perform the computations. PyTorch loads model weights in float32 or full precision by default, so changing the data type is a simple way to quickly get faster inference.
+## TensorFloat-32

-<hfoptions id="dtypes">
-<hfoption id="bfloat16">
+On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (tf32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables tf32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling tf32 for matrix multiplications. It can significantly speed up computations with typically negligible loss in numerical accuracy.

-bfloat16 is similar to float16 but it is more robust to numerical errors. Hardware support for bfloat16 varies, but most modern GPUs are capable of supporting bfloat16.
-
-```py
+```python
 import torch
-from diffusers import StableDiffusionXLPipeline
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
-).to("cuda")
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-pipeline(prompt, num_inference_steps=30).images[0]
-```
-
-</hfoption>
-<hfoption id="float16">
-
-float16 is similar to bfloat16 but may be more prone to numerical errors.
-
-```py
-import torch
-from diffusers import StableDiffusionXLPipeline
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
-).to("cuda")
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-pipeline(prompt, num_inference_steps=30).images[0]
-```
-
-</hfoption>
-<hfoption id="TensorFloat-32">
-
-[TensorFloat-32 (tf32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode is supported on NVIDIA Ampere GPUs and it computes the convolution and matrix multiplication operations in tf32. Storage and other operations are kept in float32. This enables significantly faster computations when combined with bfloat16 or float16.
-
-PyTorch only enables tf32 mode for convolutions by default and you'll need to explicitly enable it for matrix multiplications.
-
-```py
-import torch
-from diffusers import StableDiffusionXLPipeline

 torch.backends.cuda.matmul.allow_tf32 = True
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
-).to("cuda")
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-pipeline(prompt, num_inference_steps=30).images[0]
 ```

-Refer to the [mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#mixed-precision) docs for more details.
+Learn more about tf32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide.

-</hfoption>
-</hfoptions>
+## Half-precision weights

-## Scaled dot product attention
+To save GPU memory and get more speed, set `torch_dtype=torch.float16` to load and run the model weights directly with half-precision weights.

-> [!TIP]
-> Memory-efficient attention optimizes for inference speed *and* [memory usage](./memory#memory-efficient-attention)!
-
-[Scaled dot product attention (SDPA)](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) implements several attention backends, [FlashAttention](https://github.com/Dao-AILab/flash-attention), [xFormers](https://github.com/facebookresearch/xformers), and a native C++ implementation. It automatically selects the most optimal backend for your hardware.
-
-SDPA is enabled by default if you're using PyTorch >= 2.0 and no additional changes are required to your code. You could try experimenting with other attention backends though if you'd like to choose your own. The example below uses the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to enable efficient attention.
-
-```py
-from torch.nn.attention import SDPBackend, sdpa_kernel
+```python
 import torch
-from diffusers import StableDiffusionXLPipeline
+from diffusers import DiffusionPipeline

-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
-).to("cuda")
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-
-with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
-  image = pipeline(prompt, num_inference_steps=30).images[0]
-```
-
-## torch.compile
-
-[torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) accelerates inference by compiling PyTorch code and operations into optimized kernels. Diffusers typically compiles the more compute-intensive models like the UNet, transformer, or VAE.
-
-Enable the following compiler settings for maximum speed (refer to the [full list](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/config.py) for more options).
-
-```py
-import torch
-from diffusers import StableDiffusionXLPipeline
-
-torch._inductor.config.conv_1x1_as_mm = True
-torch._inductor.config.coordinate_descent_tuning = True
-torch._inductor.config.epilogue_fusion = False
-torch._inductor.config.coordinate_descent_check_all_directions = True
-```
-
-Load and compile the UNet and VAE. There are several different modes you can choose from, but `"max-autotune"` optimizes for the fastest speed by compiling to a CUDA graph. CUDA graphs effectively reduce the overhead by launching multiple GPU operations through a single CPU operation.
-
-> [!TIP]
-> With PyTorch 2.3.1, you can control the caching behavior of torch.compile. This is particularly beneficial for compilation modes like `"max-autotune"` which performs a grid-search over several compilation flags to find the optimal configuration. Learn more in the [Compile Time Caching in torch.compile](https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html) tutorial.
-
-Changing the memory layout to [channels_last](./memory#torchchannels_last) also optimizes memory and inference speed.
-
-```py
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
-).to("cuda")
-pipeline.unet.to(memory_format=torch.channels_last)
-pipeline.vae.to(memory_format=torch.channels_last)
-pipeline.unet = torch.compile(
-    pipeline.unet, mode="max-autotune", fullgraph=True
+pipe = DiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
 )
-pipeline.vae.decode = torch.compile(
-    pipeline.vae.decode,
-    mode="max-autotune",
-    fullgraph=True
-)
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-pipeline(prompt, num_inference_steps=30).images[0]
+pipe = pipe.to("cuda")
 ```

-Compilation is slow the first time, but once compiled, it is significantly faster. Try to only use the compiled pipeline on the same type of inference operations. Calling the compiled pipeline on a different image size retriggers compilation which is slow and inefficient.
-
-### Graph breaks
-
-It is important to specify `fullgraph=True` in torch.compile to ensure there are no graph breaks in the underlying model. This allows you to take advantage of torch.compile without any performance degradation. For the UNet and VAE, this changes how you access the return variables.
-
-```diff
- latents = unet(
-   latents, timestep=timestep, encoder_hidden_states=prompt_embeds
-).sample
-
-+ latents = unet(
-+   latents, timestep=timestep, encoder_hidden_states=prompt_embeds, return_dict=False
-+)[0]
-```
-
-### GPU sync
-
-The `step()` function is [called](https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L1228) on the scheduler each time after the denoiser makes a prediction, and the `sigmas` variable is [indexed](https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/schedulers/scheduling_euler_discrete.py#L476). When placed on the GPU, it introduces latency because of the communication sync between the CPU and GPU. It becomes more evident when the denoiser has already been compiled.
-
-In general, the `sigmas` should [stay on the CPU](https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/schedulers/scheduling_euler_discrete.py#L240) to avoid the communication sync and latency.
-
-## Dynamic quantization
-
-[Dynamic quantization](https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html) improves inference speed by reducing precision to enable faster math operations. This particular type of quantization determines how to scale the activations based on the data at runtime rather than using a fixed scaling factor. As a result, the scaling factor is more accurately aligned with the data.
-
-The example below applies [dynamic int8 quantization](https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html) to the UNet and VAE with the [torchao](../quantization/torchao) library.
-
-> [!TIP]
-> Refer to our [torchao](../quantization/torchao) docs to learn more about how to use the Diffusers torchao integration.
-
-Configure the compiler tags for maximum speed.
-
-```py
-import torch
-from torchao import apply_dynamic_quant
-from diffusers import StableDiffusionXLPipeline
-
-torch._inductor.config.conv_1x1_as_mm = True
-torch._inductor.config.coordinate_descent_tuning = True
-torch._inductor.config.epilogue_fusion = False
-torch._inductor.config.coordinate_descent_check_all_directions = True
-torch._inductor.config.force_fuse_int_mm_with_mul = True
-torch._inductor.config.use_mixed_mm = True
-```
-
-Filter out some linear layers in the UNet and VAE which don't benefit from dynamic quantization with the [dynamic_quant_filter_fn](https://github.com/huggingface/diffusion-fast/blob/0f169640b1db106fe6a479f78c1ed3bfaeba3386/utils/pipeline_utils.py#L16).
-
-```py
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
-).to("cuda")
-
-apply_dynamic_quant(pipeline.unet, dynamic_quant_filter_fn)
-apply_dynamic_quant(pipeline.vae, dynamic_quant_filter_fn)
-
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-pipeline(prompt, num_inference_steps=30).images[0]
-```
-
-## Fused projection matrices
-
 > [!WARNING]
-> The [fuse_qkv_projections](https://github.com/huggingface/diffusers/blob/58431f102cf39c3c8a569f32d71b2ea8caa461e1/src/diffusers/pipelines/pipeline_utils.py#L2034) method is experimental and support is limited to mostly Stable Diffusion pipelines. Take a look at this [PR](https://github.com/huggingface/diffusers/pull/6179) to learn more about how to enable it for other pipelines
+> Don't use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.

-An input is projected into three subspaces, represented by the projection matrices Q, K, and V, in an attention block. These projections are typically calculated separately, but you can horizontally combine these into a single matrix and perform the projection in a single step. It increases the size of the matrix multiplications of the input projections and also improves the impact of quantization.
+## Distilled model
+
+You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size by 51% and improve latency on CPU/GPU by 43%. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model.
+
+> [!TIP]
+> Read the [Open-sourcing Knowledge Distillation Code and Weights of SD-Small and SD-Tiny](https://huggingface.co/blog/sd_distillation) blog post to learn more about how knowledge distillation training works to produce a faster, smaller, and cheaper generative model.
+
+The inference times below are obtained from generating 4 images from the prompt "a photo of an astronaut riding a horse on mars" with 25 PNDM steps on an NVIDIA A100. Each generation is repeated 3 times with the distilled Stable Diffusion v1.4 model by [Nota AI](https://hf.co/nota-ai).
+
+| setup                        | latency | speed-up |
+|------------------------------|---------|----------|
+| baseline                     | 6.37s   | x1       |
+| distilled                    | 4.18s   | x1.52    |
+| distilled + tiny autoencoder | 3.83s   | x1.66    |
+
+Let's load the distilled Stable Diffusion model and compare it against the original Stable Diffusion model.

 ```py
-pipeline.fuse_qkv_projections()
-```
+from diffusers import StableDiffusionPipeline
+import torch
+
+distilled = StableDiffusionPipeline.from_pretrained(
+    "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+prompt = "a golden vase with different flowers"
+generator = torch.manual_seed(2023)
+image = distilled("a golden vase with different flowers", num_inference_steps=25, generator=generator).images[0]
+image
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/original_sd.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original Stable Diffusion</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion</figcaption>
+  </div>
+</div>
+
+### Tiny AutoEncoder
+
+To speed inference up even more, replace the autoencoder with a [distilled version](https://huggingface.co/sayakpaul/taesd-diffusers) of it.
+
+```py
+import torch
+from diffusers import AutoencoderTiny, StableDiffusionPipeline
+
+distilled = StableDiffusionPipeline.from_pretrained(
+    "nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+distilled.vae = AutoencoderTiny.from_pretrained(
+    "sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True,
+).to("cuda")
+
+prompt = "a golden vase with different flowers"
+generator = torch.manual_seed(2023)
+image = distilled("a golden vase with different flowers", num_inference_steps=25, generator=generator).images[0]
+image
+```
+
+<div class="flex justify-center">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd_vae.png" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder</figcaption>
+  </div>
+</div>
+
+More tiny autoencoder models for other Stable Diffusion models, like Stable Diffusion 3, are available from [madebyollin](https://huggingface.co/madebyollin).
@@ -12,258 +12,178 @@ specific language governing permissions and limitations under the License.

 # Reduce memory usage

-Modern diffusion models like [Flux](../api/pipelines/flux) and [Wan](../api/pipelines/wan) have billions of parameters that take up a lot of memory on your hardware for inference. This is challenging because common GPUs often don't have sufficient memory. To overcome the memory limitations, you can use more than one GPU (if available), offload some of the pipeline components to the CPU, and more.
+A barrier to using diffusion models is the large amount of memory required. To overcome this challenge, there are several memory-reducing techniques you can use to run even some of the largest models on free-tier or consumer GPUs. Some of these techniques can even be combined to further reduce memory usage.

-This guide will show you how to reduce your memory usage. 
+<Tip>

-> [!TIP]
-> Keep in mind these techniques may need to be adjusted depending on the model! For example, a transformer-based diffusion model may not benefit equally from these inference speed optimizations as a UNet-based model.
+In many cases, optimizing for memory or speed leads to improved performance in the other, so you should try to optimize for both whenever you can. This guide focuses on minimizing memory usage, but you can also learn more about how to [Speed up inference](fp16).

-## Multiple GPUs
+</Tip>

-If you have access to more than one GPU, there a few options for efficiently loading and distributing a large model across your hardware. These features are supported by the [Accelerate](https://huggingface.co/docs/accelerate/index) library, so make sure it is installed first.
+The results below are obtained from generating a single 512x512 image from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps on an NVIDIA Titan RTX, demonstrating the speed-up you can expect as a result of reduced memory consumption.

-```bash
-pip install -U accelerate
-```
+|                  | latency | speed-up |
+| ---------------- | ------- | ------- |
+| original         | 9.50s   | x1      |
+| fp16             | 3.61s   | x2.63   |
+| channels last    | 3.30s   | x2.88   |
+| traced UNet      | 3.21s   | x2.96   |
+| memory-efficient attention  | 2.63s  | x3.61   |

-### Sharded checkpoints
+## Sliced VAE

-Loading large checkpoints in several shards in useful because the shards are loaded one at a time. This keeps memory usage low, only requiring enough memory for the model size and the largest shard size. We recommend sharding when the fp32 checkpoint is greater than 5GB. The default shard size is 5GB.
+Sliced VAE enables decoding large batches of images with limited VRAM or batches with 32 images or more by decoding the batches of latents one image at a time. You'll likely want to couple this with [`~ModelMixin.enable_xformers_memory_efficient_attention`] to reduce memory use further if you have xFormers installed.

-Shard a checkpoint in [`~DiffusionPipeline.save_pretrained`] with the `max_shard_size` parameter.
+To use sliced VAE, call [`~StableDiffusionPipeline.enable_vae_slicing`] on your pipeline before inference:

-```py
-from diffusers import AutoModel
-
-unet = AutoModel.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet"
-)
-unet.save_pretrained("sdxl-unet-sharded", max_shard_size="5GB")
-```
-
-Now you can use the sharded checkpoint, instead of the regular checkpoint, to save memory.
-
-```py
+```python
 import torch
-from diffusers import AutoModel, StableDiffusionXLPipeline
+from diffusers import StableDiffusionPipeline

-unet = AutoModel.from_pretrained(
-    "username/sdxl-unet-sharded", torch_dtype=torch.float16
-)
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    unet=unet,
-    torch_dtype=torch.float16
-).to("cuda")
-```
-
-### Device placement
-
-> [!WARNING]
-> Device placement is an experimental feature and the API may change. Only the `balanced` strategy is supported at the moment. We plan to support additional mapping strategies in the future.
-
-The `device_map` parameter controls how the model components in a pipeline are distributed across devices. The `balanced` device placement strategy evenly splits the pipeline across all available devices.
-
-```py
-import torch
-from diffusers import AutoModel, StableDiffusionXLPipeline
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
+pipe = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
-    device_map="balanced"
+    use_safetensors=True,
 )
+pipe = pipe.to("cuda")
+
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_vae_slicing()
+#pipe.enable_xformers_memory_efficient_attention()
+images = pipe([prompt] * 32).images
 ```

-You can inspect a pipeline's device map with `hf_device_map`.
+You may see a small performance boost in VAE decoding on multi-image batches, and there should be no performance impact on single-image batches.

-```py
-print(pipeline.hf_device_map)
-{'unet': 1, 'vae': 1, 'safety_checker': 0, 'text_encoder': 0}
-```
+## Tiled VAE

-The `device_map` parameter also works on the model-level. This is useful for loading large models, such as the Flux diffusion transformer which has 12.5B parameters. Instead of `balanced`, set it to `"auto"` to automatically distribute a model across the fastest device first before moving to slower devices. Refer to the [Model sharding](../training/distributed_inference#model-sharding) docs for more details.
+Tiled VAE processing also enables working with large images on limited VRAM (for example, generating 4k images on 8GB of VRAM) by splitting the image into overlapping tiles, decoding the tiles, and then blending the outputs together to compose the final image. You should also use tiled VAE with [`~ModelMixin.enable_xformers_memory_efficient_attention`] to reduce memory use further if you have xFormers installed.

-```py
+To use tiled VAE processing, call [`~StableDiffusionPipeline.enable_vae_tiling`] on your pipeline before inference:
+
+```python
 import torch
-from diffusers import AutoModel
+from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler

-transformer = AutoModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev", 
-    subfolder="transformer",
-    device_map="auto",
-    torch_dtype=torch.bfloat16
-)
-```
-
-For more fine-grained control, pass a dictionary to enforce the maximum GPU memory to use on each device. If a device is not in `max_memory`, it is ignored and pipeline components won't be distributed to it.
-
-```py
-import torch
-from diffusers import AutoModel, StableDiffusionXLPipeline
-
-max_memory = {0:"1GB", 1:"1GB"}
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
+pipe = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
-    device_map="balanced",
-    max_memory=max_memory
+    use_safetensors=True,
 )
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+pipe = pipe.to("cuda")
+prompt = "a beautiful landscape photograph"
+pipe.enable_vae_tiling()
+#pipe.enable_xformers_memory_efficient_attention()
+
+image = pipe([prompt], width=3840, height=2224, num_inference_steps=20).images[0]
 ```

-Diffusers uses the maxmium memory of all devices by default, but if they don't fit on the GPUs, then you'll need to use a single GPU and offload to the CPU with the methods below.
-
- [`~DiffusionPipeline.enable_model_cpu_offload`] only works on a single GPU but a very large model may not fit on it
- [`~DiffusionPipeline.enable_sequential_cpu_offload`] may work but it is extremely slow and also limited to a single GPU
-
-Use the [`~DiffusionPipeline.reset_device_map`] method to reset the `device_map`. This is necessary if you want to use methods like `.to()`, [`~DiffusionPipeline.enable_sequential_cpu_offload`], and [`~DiffusionPipeline.enable_model_cpu_offload`] on a pipeline that was device-mapped.
-
-```py
-pipeline.reset_device_map()
-```
-
-## VAE slicing
-
-VAE slicing saves memory by splitting large batches of inputs into a single batch of data and separately processing them. This method works best when generating more than one image at a time.
-
-For example, if you're generating 4 images at once, decoding would increase peak activation memory by 4x. VAE slicing reduces this by only decoding 1 image at a time instead of all 4 images at once.
-
-Call [`~StableDiffusionPipeline.enable_vae_slicing`] to enable sliced VAE. You can expect a small increase in performance when decoding multi-image batches and no performance impact for single-image batches.
-
-```py
-import torch
-from diffusers import AutoModel, StableDiffusionXLPipeline
-
-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16,
-).to("cuda")
-pipeline.enable_vae_slicing()
-pipeline(["An astronaut riding a horse on Mars"]*32).images[0]
-print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
-```
-
-> [!WARNING]
-> [`AutoencoderKLWan`] and [`AsymmetricAutoencoderKL`] don't support slicing.
-
-## VAE tiling
-
-VAE tiling saves memory by dividing an image into smaller overlapping tiles instead of processing the entire image at once. This also reduces peak memory usage because the GPU is only processing a tile at a time.
-
-Call [`~StableDiffusionPipeline.enable_vae_tiling`] to enable VAE tiling. The generated image may have some tone variation from tile-to-tile because they're decoded separately, but there shouldn't be any obvious seams between the tiles. Tiling is disabled for resolutions lower than a pre-specified (but configurable) limit. For example, this limit is 512x512 for the VAE in [`StableDiffusionPipeline`].
-
-```py
-import torch
-from diffusers import AutoPipelineForImage2Image
-from diffusers.utils import load_image
-
-pipeline = AutoPipelineForImage2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
-).to("cuda")
-pipeline.enable_vae_tiling()
-
-init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-sdxl-init.png")
-prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
-pipeline(prompt, image=init_image, strength=0.5).images[0]
-print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
-```
-
-> [!WARNING]
-> [`AutoencoderKLWan`] and [`AsymmetricAutoencoderKL`] don't support tiling.
+The output image has some tile-to-tile tone variation because the tiles are decoded separately, but you shouldn't see any sharp and obvious seams between the tiles. Tiling is turned off for images that are 512x512 or smaller.

 ## CPU offloading

-CPU offloading selectively moves weights from the GPU to the CPU. When a component is required, it is transferred to the GPU and when it isn't required, it is moved to the CPU. This method works on submodules rather than whole models. It saves memory by avoiding storing the entire model on the GPU.
+Offloading the weights to the CPU and only loading them on the GPU when performing the forward pass can also save memory. Often, this technique can reduce memory consumption to less than 3GB.

-CPU offloading dramatically reduces memory usage, but it is also **extremely slow** because submodules are passed back and forth multiple times between devices. It can often be impractical due to how slow it is.
+To perform CPU offloading, call [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]:

-> [!WARNING]
-> Don't move the pipeline to CUDA before calling [`~DiffusionPipeline.enable_sequential_cpu_offload`], otherwise the amount of memory saved is only minimal (refer to this [issue](https://github.com/huggingface/diffusers/issues/1934) for more details). This is a stateful operation that installs hooks on the model.
-
-Call [`~DiffusionPipeline.enable_sequential_cpu_offload`] to enable it on a pipeline.
-
-```py
+```Python
 import torch
-from diffusers import DiffusionPipeline
+from diffusers import StableDiffusionPipeline

-pipeline = DiffusionPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
+pipe = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
 )
-pipeline.enable_sequential_cpu_offload()

-pipeline(
-    prompt="An astronaut riding a horse on Mars",
-    guidance_scale=0.,
-    height=768,
-    width=1360,
-    num_inference_steps=4,
-    max_sequence_length=256,
-).images[0]
-print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_sequential_cpu_offload()
+image = pipe(prompt).images[0]
 ```

+CPU offloading works on submodules rather than whole models. This is the best way to minimize memory consumption, but inference is much slower due to the iterative nature of the diffusion process. The UNet component of the pipeline runs several times (as many as `num_inference_steps`); each time, the different UNet submodules are sequentially onloaded and offloaded as needed, resulting in a large number of memory transfers.
+
+<Tip>
+
+Consider using [model offloading](#model-offloading) if you want to optimize for speed because it is much faster. The tradeoff is your memory savings won't be as large.
+
+</Tip>
+
+<Tip warning={true}>
+
+When using [`~StableDiffusionPipeline.enable_sequential_cpu_offload`], don't move the pipeline to CUDA beforehand or else the memory savings will only be minimal (see this [issue](https://github.com/huggingface/diffusers/issues/1934) for more information).
+
+[`~StableDiffusionPipeline.enable_sequential_cpu_offload`] is a stateful operation that installs hooks on the models.
+
+</Tip>
+
 ## Model offloading

-Model offloading moves entire models to the GPU instead of selectively moving *some* layers or model components. One of the main pipeline models, usually the text encoder, UNet, and VAE, is placed on the GPU while the other components are held on the CPU. Components like the UNet that run multiple times stays on the GPU until its completely finished and no longer needed. This eliminates the communication overhead of [CPU offloading](#cpu-offloading) and makes model offloading a faster alternative. The tradeoff is memory savings won't be as large.
+<Tip>

-> [!WARNING]
-> Keep in mind that if models are reused outside the pipeline after hookes have been installed (see [Removing Hooks](https://huggingface.co/docs/accelerate/en/package_reference/big_modeling#accelerate.hooks.remove_hook_from_module) for more details), you need to run the entire pipeline and models in the expected order to properly offload them. This is a stateful operation that installs hooks on the model.
+Model offloading requires 🤗 Accelerate version 0.17.0 or higher.

-Call [`~DiffusionPipeline.enable_model_cpu_offload`] to enable it on a pipeline.
+</Tip>

-```py
+[Sequential CPU offloading](#cpu-offloading) saves a lot of memory, but it makes inference slower because submodules are moved to the GPU as needed, and they're immediately returned to the CPU when a new module runs.
+
+Full-model offloading is an alternative that moves whole models to the GPU, instead of handling each model's constituent *submodules*. There is a negligible impact on inference time (compared with moving the pipeline to `cuda`), and it still provides some memory savings.
+
+During model offloading, only one of the main components of the pipeline (typically the text encoder, UNet and VAE)
+is placed on the GPU while the others wait on the CPU. Components like the UNet that run for multiple iterations stay on the GPU until they're no longer needed.
+
+Enable model offloading by calling [`~StableDiffusionPipeline.enable_model_cpu_offload`] on the pipeline:
+
+```Python
 import torch
-from diffusers import DiffusionPipeline
+from diffusers import StableDiffusionPipeline

-pipeline = DiffusionPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
+pipe = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
 )
-pipline.enable_model_cpu_offload()

-pipeline(
-    prompt="An astronaut riding a horse on Mars",
-    guidance_scale=0.,
-    height=768,
-    width=1360,
-    num_inference_steps=4,
-    max_sequence_length=256,
-).images[0]
-print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
+prompt = "a photo of an astronaut riding a horse on mars"
+pipe.enable_model_cpu_offload()
+image = pipe(prompt).images[0]
 ```

-[`~DiffusionPipeline.enable_model_cpu_offload`] also helps when you're using the [`~StableDiffusionXLPipeline.encode_prompt`] method on its own to generate the text encoders hidden state.
+<Tip warning={true}>
+
+To properly offload models after they're called, you must run the entire pipeline and call the models in the pipeline's expected order. Exercise caution if models are reused outside the context of the pipeline after hooks have been installed. See [Removing Hooks](https://huggingface.co/docs/accelerate/en/package_reference/big_modeling#accelerate.hooks.remove_hook_from_module) for more information.
+
+[`~StableDiffusionPipeline.enable_model_cpu_offload`] is a stateful operation that installs hooks on the models and state on the pipeline.
+
+</Tip>

 ## Group offloading

-Group offloading moves groups of internal layers ([torch.nn.ModuleList](https://pytorch.org/docs/stable/generated/torch.nn.ModuleList.html) or [torch.nn.Sequential](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html)) to the CPU. It uses less memory than [model offloading](#model-offloading) and it is faster than [CPU offloading](#cpu-offloading) because it reduces communication overhead.
+Group offloading is the middle ground between sequential and model offloading. It works by offloading groups of internal layers (either `torch.nn.ModuleList` or `torch.nn.Sequential`), which uses less memory than model-level offloading. It is also faster than sequential-level offloading because the number of device synchronizations is reduced.

-> [!WARNING]
-> Group offloading may not work with all models if the forward implementation contains weight-dependent device casting of inputs because it may clash with group offloading's device casting mechanism.
+To enable group offloading, call the [`~ModelMixin.enable_group_offload`] method on the model if it is a Diffusers model implementation. For any other model implementation, use [`~hooks.group_offloading.apply_group_offloading`]:

-Call [`~ModelMixin.enable_group_offload`] to enable it for standard Diffusers model components that inherit from [`ModelMixin`]. For other model components that don't inherit from [`ModelMixin`], such as a generic [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), use [`~hooks.apply_group_offloading`] instead.
-
-The `offload_type` parameter can be set to `block_level` or `leaf_level`.
-
- `block_level` offloads groups of layers based on the `num_blocks_per_group` parameter. For example, if `num_blocks_per_group=2` on a model with 40 layers, 2 layers are onloaded and offloaded at a time (20 total onloads/offloads). This drastically reduces memory requirements.
- `leaf_level` offloads individual layers at the lowest level and is equivalent to [CPU offloading](#cpu-offloading). But it can be made faster if you use streams without giving up inference speed.
-
-```py
+```python
 import torch
 from diffusers import CogVideoXPipeline
 from diffusers.hooks import apply_group_offloading
 from diffusers.utils import export_to_video

+# Load the pipeline
 onload_device = torch.device("cuda")
 offload_device = torch.device("cpu")
-pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
+pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)

-# Use the enable_group_offload method for Diffusers model implementations
-pipeline.transformer.enable_group_offload(onload_device=onload_device, offload_device=offload_device, offload_type="leaf_level")
-pipeline.vae.enable_group_offload(onload_device=onload_device, offload_type="leaf_level")
+# We can utilize the enable_group_offload method for Diffusers model implementations
+pipe.transformer.enable_group_offload(onload_device=onload_device, offload_device=offload_device, offload_type="leaf_level", use_stream=True)

-# Use the apply_group_offloading method for other model components
-apply_group_offloading(pipeline.text_encoder, onload_device=onload_device, offload_type="block_level", num_blocks_per_group=2)
+# Uncomment the following to also allow recording the current streams.
+# pipe.transformer.enable_group_offload(onload_device=onload_device, offload_device=offload_device, offload_type="leaf_level", use_stream=True, record_stream=True)
+
+# For any other model implementations, the apply_group_offloading function can be used
+apply_group_offloading(pipe.text_encoder, onload_device=onload_device, offload_type="block_level", num_blocks_per_group=2)
+apply_group_offloading(pipe.vae, onload_device=onload_device, offload_type="leaf_level")

 prompt = (
    "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
@@ -273,55 +193,48 @@ prompt = (
    "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
    "atmosphere of this unique musical performance."
 )
-video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
+video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
+# This utilized about 14.79 GB. It can be further reduced by using tiling and using leaf_level offloading throughout the pipeline.
 print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
 export_to_video(video, "output.mp4", fps=8)
 ```

-### CUDA stream
+Group offloading (for CUDA devices with support for asynchronous data transfer streams) overlaps data transfer and computation to reduce the overall execution time compared to sequential offloading. This is enabled using layer prefetching with CUDA streams. The next layer to be executed is loaded onto the accelerator device while the current layer is being executed - this increases the memory requirements slightly. Group offloading also supports leaf-level offloading (equivalent to sequential CPU offloading) but can be made much faster when using streams.

-The `use_stream` parameter can be activated for CUDA devices that support asynchronous data transfer streams to reduce overall execution time compared to [CPU offloading](#cpu-offloading). It overlaps data transfer and computation by using layer prefetching. The next layer to be executed is loaded onto the GPU while the current layer is still being executed. It can increase CPU memory significantly so ensure you have 2x the amount of memory as the model size.
+<Tip>

-Set `record_stream=True` for more of a speedup at the cost of slightly increased memory usage. Refer to the [torch.Tensor.record_stream](https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html) docs to learn more.
+- Group offloading may not work with all models out-of-the-box. If the forward implementations of the model contain weight-dependent device-casting of inputs, it may clash with the offloading mechanism's handling of device-casting.
+- The `offload_type` parameter can be set to either `block_level` or `leaf_level`. `block_level` offloads groups of `torch.nn.ModuleList` or `torch.nn.Sequential` modules based on a configurable attribute `num_blocks_per_group`. For example, if you set `num_blocks_per_group=2` on a standard transformer model containing 40 layers, it will onload/offload 2 layers at a time for a total of 20 onload/offloads. This drastically reduces the VRAM requirements. `leaf_level` offloads individual layers at the lowest level, which is equivalent to sequential offloading. However, unlike sequential offloading, group offloading can be made much faster when using streams, with minimal compromise to end-to-end generation time.
+- The `use_stream` parameter can be used with CUDA devices to enable prefetching layers for onload. It defaults to `False`. Layer prefetching allows overlapping computation and data transfer of model weights, which drastically reduces the overall execution time compared to other offloading methods. However, it can increase the CPU RAM usage significantly. Ensure that the available CPU RAM is at least twice the size of the model when setting `use_stream=True`. You can find more information about CUDA streams [here](https://pytorch.org/docs/stable/generated/torch.cuda.Stream.html).
+- If specifying `use_stream=True` on VAEs with tiling enabled, make sure to do a dummy forward pass (possibly with dummy inputs) before the actual inference to avoid device-mismatch errors. This may not work on all implementations. Please open an issue if you encounter any problems.
+- The parameter `low_cpu_mem_usage` can be set to `True` to reduce CPU memory usage when using streams for group offloading. This is useful when the CPU memory is the bottleneck, but it may counteract the benefits of using streams and increase the overall execution time. The CPU memory savings come from creating pinned-tensors on-the-fly instead of pre-pinning them. This parameter is better suited for using `leaf_level` offloading.
+- When using `use_stream=True`, users can additionally specify `record_stream=True` to get better speedups at the expense of slightly increased memory usage. Refer to the [official PyTorch docs](https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html) to learn more.

-> [!TIP]
-> When `use_stream=True` on VAEs with tiling enabled, make sure to do a dummy forward pass (possible with dummy inputs as well) before inference to avoid device mismatch errors. This may not work on all implementations, so feel free to open an issue if you encounter any problems.
+For more information about available parameters and an explanation of how group offloading works, refer to [`~hooks.group_offloading.apply_group_offloading`].

-If you're using `block_level` group offloading with `use_stream` enabled, the `num_blocks_per_group` parameter should be set to `1`, otherwise a warning will be raised.
+</Tip>

-```py
-pipeline.transformer.enable_group_offload(onload_device=onload_device, offload_device=offload_device, offload_type="leaf_level", use_stream=True, record_stream=True)
-```
+## FP8 layerwise weight-casting

-The `low_cpu_mem_usage` parameter can be set to `True` to reduce CPU memory usage when using streams during group offloading. It is best for `leaf_level` offloading and when CPU memory is bottlenecked. Memory is saved by creating pinned tensors on the fly instead of pre-pinning them. However, this may increase overall execution time.
+PyTorch supports `torch.float8_e4m3fn` and `torch.float8_e5m2` as weight storage dtypes, but they can't be used for computation in many different tensor operations due to unimplemented kernel support. However, you can use these dtypes to store model weights in fp8 precision and upcast them on-the-fly when the layers are used in the forward pass. This is known as layerwise weight-casting.

-## Layerwise casting
+Typically, inference on most models is done with `torch.float16` or `torch.bfloat16` weight/computation precision. Layerwise weight-casting cuts down the memory footprint of the model weights by approximately half.

-Layerwise casting stores weights in a smaller data format (for example, `torch.float8_e4m3fn` and `torch.float8_e5m2`) to use less memory and upcasts those weights to a higher precision like `torch.float16` or `torch.bfloat16` for computation. Certain layers (normalization and modulation related weights) are skipped because storing them in fp8 can degrade generation quality.
-
-> [!WARNING]
-> Layerwise casting may not work with all models if the forward implementation contains internal typecasting of weights. The current implementation of layerwise casting assumes the forward pass is independent of the weight precision and the input datatypes are always specified in `compute_dtype` (see [here](https://github.com/huggingface/transformers/blob/7f5077e53682ca855afc826162b204ebf809f1f9/src/transformers/models/t5/modeling_t5.py#L294-L299) for an incompatible implementation).
->
-> Layerwise casting may also fail on custom modeling implementations with [PEFT](https://huggingface.co/docs/peft/index) layers. There are some checks available but they are not extensively tested or guaranteed to work in all cases.
-
-Call [`~ModelMixin.enable_layerwise_casting`] to set the storage and computation datatypes.
-
-```py
+```python
 import torch
 from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel
 from diffusers.utils import export_to_video

-transformer = CogVideoXTransformer3DModel.from_pretrained(
-    "THUDM/CogVideoX-5b",
-    subfolder="transformer",
-    torch_dtype=torch.bfloat16
-)
+model_id = "THUDM/CogVideoX-5b"
+
+# Load the model in bfloat16 and enable layerwise casting
+transformer = CogVideoXTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.bfloat16)
 transformer.enable_layerwise_casting(storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16)

-pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b",
-    transformer=transformer,
-    torch_dtype=torch.bfloat16
-).to("cuda")
+# Load the pipeline
+pipe = CogVideoXPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.bfloat16)
+pipe.to("cuda")
+
 prompt = (
    "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
    "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
@@ -330,53 +243,43 @@ prompt = (
    "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
    "atmosphere of this unique musical performance."
 )
-video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
-print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
+video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
 export_to_video(video, "output.mp4", fps=8)
 ```

-The [`~hooks.apply_layerwise_casting`] method can also be used if you need more control and flexibility. It can be partially applied to model layers by calling it on specific internal modules. Use the `skip_modules_pattern` or `skip_modules_classes` parameters to specify modules to avoid, such as the normalization and modulation layers.
+In the above example, layerwise casting is enabled on the transformer component of the pipeline. By default, certain layers are skipped from the FP8 weight casting because it can lead to significant degradation of generation quality. The normalization and modulation related weight parameters are also skipped by default.
+
+However, you gain more control and flexibility by directly utilizing the [`~hooks.layerwise_casting.apply_layerwise_casting`] function instead of [`~ModelMixin.enable_layerwise_casting`].
+
+<Tip>
+
+- Layerwise casting may not work with all models out-of-the-box. Sometimes, the forward implementations of the model might contain internal typecasting of weight values. Such implementations are not supported due to the currently simplistic implementation of layerwise casting, which assumes that the forward pass is independent of the weight precision and that the input dtypes are always in `compute_dtype`. An example of an incompatible implementation can be found [here](https://github.com/huggingface/transformers/blob/7f5077e53682ca855afc826162b204ebf809f1f9/src/transformers/models/t5/modeling_t5.py#L294-L299).
+- Layerwise casting may fail on custom modeling implementations that make use of [PEFT](https://github.com/huggingface/peft) layers. Some minimal checks to handle this case are implemented, but they are not extensively tested or guaranteed to work in all cases.
+- Layerwise casting can also be applied partially to specific layers of a model. Partially applying layerwise casting can either be done manually by calling the `apply_layerwise_casting` function on specific internal modules, or by specifying the `skip_modules_pattern` and `skip_modules_classes` parameters for a root module. These parameters are particularly useful for layers such as normalization and modulation.
+
+</Tip>
+
+## Channels-last memory format
+
+The channels-last memory format is an alternative way of ordering NCHW tensors in memory to preserve dimension ordering. Channels-last tensors are ordered in such a way that the channels become the densest dimension (storing images pixel-per-pixel). Since not all operators currently support the channels-last format, it may result in worse performance, but you should still try it and see if it works for your model.
+
+For example, to set the pipeline's UNet to use the channels-last format:

 ```python
-import torch
-from diffusers import CogVideoXTransformer3DModel
-from diffusers.hooks import apply_layerwise_casting
-
-transformer = CogVideoXTransformer3DModel.from_pretrained(
-    "THUDM/CogVideoX-5b",
-    subfolder="transformer",
-    torch_dtype=torch.bfloat16
-)
-
-# skip the normalization layer
-apply_layerwise_casting(
-    transformer,
-    storage_dtype=torch.float8_e4m3fn,
-    compute_dtype=torch.bfloat16,
-    skip_modules_classes=["norm"],
-    non_blocking=True,
-)
-```
-
-## torch.channels_last
-
-[torch.channels_last](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) flips how tensors are stored from `(batch size, channels, height, width)` to `(batch size, heigh, width, channels)`. This aligns the tensors with how the hardware sequentially accesses the tensors stored in memory and avoids skipping around in memory to access the pixel values.
-
-Not all operators currently support the channels-last format and may result in worst performance, but it is still worth trying.
-
-```py
-print(pipeline.unet.conv_out.state_dict()["weight"].stride())  # (2880, 9, 3, 1)
-pipeline.unet.to(memory_format=torch.channels_last)  # in-place operation
+print(pipe.unet.conv_out.state_dict()["weight"].stride())  # (2880, 9, 3, 1)
+pipe.unet.to(memory_format=torch.channels_last)  # in-place operation
 print(
-    pipeline.unet.conv_out.state_dict()["weight"].stride()
+    pipe.unet.conv_out.state_dict()["weight"].stride()
 )  # (2880, 1, 960, 320) having a stride of 1 for the 2nd dimension proves that it works
 ```

-## torch.jit.trace
+## Tracing

-[torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) records the operations a model performs on a sample input and creates a new, optimized representation of the model based on the recorded execution path. During tracing, the model is optimized to reduce overhead from Python and dynamic control flows and operations are fused together for more efficiency. The returned executable or [ScriptFunction](https://pytorch.org/docs/stable/generated/torch.jit.ScriptFunction.html) can be compiled.
+Tracing runs an example input tensor through the model and captures the operations that are performed on it as that input makes its way through the model's layers. The executable or `ScriptFunction` that is returned is optimized with just-in-time compilation.

-```py
+To trace a UNet:
+
+```python
 import time
 import torch
 from diffusers import StableDiffusionPipeline
@@ -389,7 +292,8 @@ torch.set_grad_enabled(False)
 n_experiments = 2
 unet_runs_per_experiment = 50

-# load sample inputs
+
+# load inputs
 def generate_inputs():
    sample = torch.randn((2, 4, 64, 64), device="cuda", dtype=torch.float16)
    timestep = torch.rand(1, device="cuda", dtype=torch.float16) * 999
@@ -397,12 +301,12 @@ def generate_inputs():
    return sample, timestep, encoder_hidden_states


-pipeline = StableDiffusionPipeline.from_pretrained(
+pipe = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
 ).to("cuda")
-unet = pipeline.unet
+unet = pipe.unet
 unet.eval()
 unet.to(memory_format=torch.channels_last)  # use channels_last memory format
 unet.forward = functools.partial(unet.forward, return_dict=False)  # set return_dict=False as default
@@ -419,12 +323,14 @@ unet_traced = torch.jit.trace(unet, inputs)
 unet_traced.eval()
 print("done tracing")

+
 # warmup and optimize graph
 for _ in range(5):
    with torch.inference_mode():
        inputs = generate_inputs()
        orig_output = unet_traced(*inputs)

+
 # benchmarking
 with torch.inference_mode():
    for _ in range(n_experiments):
@@ -446,18 +352,20 @@ with torch.inference_mode():
 unet_traced.save("unet_traced.pt")
 ```

-Replace the pipeline's UNet with the traced version.
+Replace the `unet` attribute of the pipeline with the traced model:

-```py
-import torch
+```python
 from diffusers import StableDiffusionPipeline
+import torch
 from dataclasses import dataclass

+
@dataclass
 class UNet2DConditionOutput:
    sample: torch.Tensor

-pipeline = StableDiffusionPipeline.from_pretrained(
+
+pipe = StableDiffusionPipeline.from_pretrained(
    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
    use_safetensors=True,
@@ -466,7 +374,8 @@ pipeline = StableDiffusionPipeline.from_pretrained(
 # use jitted unet
 unet_traced = torch.jit.load("unet_traced.pt")

-# del pipeline.unet
+
+# del pipe.unet
 class TracedUNet(torch.nn.Module):
    def __init__(self):
        super().__init__()
@@ -477,7 +386,8 @@ class TracedUNet(torch.nn.Module):
        sample = unet_traced(latent_model_input, t, encoder_hidden_states)[0]
        return UNet2DConditionOutput(sample=sample)

-pipeline.unet = TracedUNet()
+
+pipe.unet = TracedUNet()

 with torch.inference_mode():
    image = pipe([prompt] * 1, num_inference_steps=50).images[0]
@@ -485,31 +395,39 @@ with torch.inference_mode():

 ## Memory-efficient attention

-> [!TIP]
-> Memory-efficient attention optimizes for memory usage *and* [inference speed](./fp16#scaled-dot-product-attention!
+Recent work on optimizing bandwidth in the attention block has generated huge speed-ups and reductions in GPU memory usage. The most recent type of memory-efficient attention is [Flash Attention](https://arxiv.org/abs/2205.14135) (you can check out the original code at [HazyResearch/flash-attention](https://github.com/HazyResearch/flash-attention)).

-The Transformers attention mechanism is memory-intensive, especially for long sequences, so you can try using different and more memory-efficient attention types.
+<Tip>

-By default, if PyTorch >= 2.0 is installed, [scaled dot-product attention (SDPA)](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) is used. You don't need to make any additional changes to your code.
+If you have PyTorch >= 2.0 installed, you should not expect a speed-up for inference when enabling `xformers`.

-SDPA supports [FlashAttention](https://github.com/Dao-AILab/flash-attention) and [xFormers](https://github.com/facebookresearch/xformers) as well as a native C++ PyTorch implementation. It automatically selects the most optimal implementation based on your input.
+</Tip>

-You can explicitly use xFormers with the [`~ModelMixin.enable_xformers_memory_efficient_attention`] method.
+To use Flash Attention, install the following:

-```py
-# pip install xformers
+- PyTorch > 1.12
+- CUDA available
+- [xFormers](xformers)
+
+Then call [`~ModelMixin.enable_xformers_memory_efficient_attention`] on the pipeline:
+
+```python
+from diffusers import DiffusionPipeline
 import torch
-from diffusers import StableDiffusionXLPipeline

-pipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
+pipe = DiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
    torch_dtype=torch.float16,
+    use_safetensors=True,
 ).to("cuda")
-pipeline.enable_xformers_memory_efficient_attention()
+
+pipe.enable_xformers_memory_efficient_attention()
+
+with torch.inference_mode():
+    sample = pipe("a small cat")
+
+# optional: You can disable it via
+# pipe.disable_xformers_memory_efficient_attention()
 ```

-Call [`~ModelMixin.disable_xformers_memory_efficient_attention`] to disable it.
-
-```py
-pipeline.disable_xformers_memory_efficient_attention()
-```
+The iteration speed when using `xformers` should match the iteration speed of PyTorch 2.0 as described [here](torch2.0).
@@ -78,23 +78,6 @@ For more information and different options about `torch.compile`, refer to the [
 > [!TIP]
 > Learn more about other ways PyTorch 2.0 can help optimize your model in the [Accelerate inference of text-to-image diffusion models](../tutorials/fast_diffusion) tutorial.

-### Regional compilation
-
-Compiling the whole model usually has a big problem space for optimization. Models are often composed of multiple repeated blocks. [Regional compilation](https://pytorch.org/tutorials/recipes/regional_compilation.html) compiles the repeated block first (a transformer encoder block, for example), so that the Torch compiler would re-use its cached/optimized generated code for the other blocks, reducing (often massively) the cold start compilation time observed on the first inference call.
-
-Enabling regional compilation might require simple yet intrusive changes to the
-modeling code. However, 🤗 Accelerate provides a utility [`compile_regions()`](https://huggingface.co/docs/accelerate/main/en/usage_guides/compilation#how-to-use-regional-compilation) which automatically compiles
-the repeated blocks of the provided `nn.Module` sequentially, and the rest of the model separately. This helps with reducing cold start time while keeping most (if not all) of the speedup you would get from full compilation.
-
-```py
-# Make sure you're on the latest `accelerate`: `pip install -U accelerate`.
-from accelerate.utils import compile_regions
-
-pipe.unet = compile_regions(pipe.unet, mode="reduce-overhead", fullgraph=True)
-```
-
-As you may have noticed `compile_regions()` takes the same arguments as `torch.compile()`, allowing flexibility.
-
 ## Benchmark

 We conducted a comprehensive benchmark with PyTorch 2.0's efficient attention implementation and `torch.compile` across different GPUs and batch sizes for five of our most used pipelines. The code is benchmarked on 🤗 Diffusers v0.17.0.dev0 to optimize `torch.compile` usage (see [here](https://github.com/huggingface/diffusers/pull/3313) for more details).
@@ -48,7 +48,7 @@ For Ada and higher-series GPUs. we recommend changing `torch_dtype` to `torch.bf
 ```py
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
-import torch
+
 from diffusers import AutoModel
 from transformers import T5EncoderModel

@@ -88,8 +88,6 @@ Setting `device_map="auto"` automatically fills all available space on the GPU(s
 CPU, and finally, the hard drive (the absolute slowest option) if there is still not enough memory.

 ```py
-from diffusers import FluxPipeline
-
 pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    transformer=transformer_8bit,
@@ -134,7 +132,7 @@ For Ada and higher-series GPUs. we recommend changing `torch_dtype` to `torch.bf
 ```py
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
-import torch
+
 from diffusers import AutoModel
 from transformers import T5EncoderModel

@@ -173,8 +171,6 @@ Let's generate an image using our quantized models.
 Setting `device_map="auto"` automatically fills all available space on the GPU(s) first, then the CPU, and finally, the hard drive (the absolute slowest option) if there is still not enough memory.

 ```py
-from diffusers import FluxPipeline
-
 pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    transformer=transformer_4bit,
@@ -218,8 +214,6 @@ Check your memory footprint with the `get_memory_footprint` method:
 print(model.get_memory_footprint())
 ```

-Note that this only tells you the memory footprint of the model params and does _not_ estimate the inference memory requirements.
-
 Quantized models can be loaded from the [`~ModelMixin.from_pretrained`] method without needing to specify the `quantization_config` parameters:

 ```py
@@ -419,4 +413,4 @@ transformer_4bit.dequantize()
 ## Resources

 * [End-to-end notebook showing Flux.1 Dev inference in a free-tier Colab](https://gist.github.com/sayakpaul/c76bd845b48759e11687ac550b99d8b4)
-* [Training](https://github.com/huggingface/diffusers/blob/8c661ea586bf11cb2440da740dd3c4cf84679b85/examples/dreambooth/README_hidream.md#using-quantization)
+* [Training](https://gist.github.com/sayakpaul/05afd428bc089b47af7c016e42004527)
@@ -39,90 +39,3 @@ Diffusers currently supports the following quantization methods.
 - [Quanto](./quanto.md)

 [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques.
-
-## Pipeline-level quantization
-
-Diffusers allows users to directly initialize pipelines from checkpoints that may contain quantized models ([example](https://huggingface.co/hf-internal-testing/flux.1-dev-nf4-pkg)). However, users may want to apply
-quantization on-the-fly when initializing a pipeline from a pre-trained and non-quantized checkpoint. You can
-do this with [`~quantizers.PipelineQuantizationConfig`].
-
-Start by defining a `PipelineQuantizationConfig`:
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-from diffusers.quantizers.quantization_config import QuantoConfig
-from diffusers.quantizers import PipelineQuantizationConfig
-from transformers import BitsAndBytesConfig
-
-pipeline_quant_config = PipelineQuantizationConfig(
-    quant_mapping={
-        "transformer": QuantoConfig(weights_dtype="int8"),
-        "text_encoder_2": BitsAndBytesConfig(
-            load_in_4bit=True, compute_dtype=torch.bfloat16
-        ),
-    }
-)
-```
-
-Then pass it to [`~DiffusionPipeline.from_pretrained`] and run inference:
-
-```py
-pipe = DiffusionPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    quantization_config=pipeline_quant_config,
-    torch_dtype=torch.bfloat16,
-).to("cuda")
-
-image = pipe("photo of a cute dog").images[0]
-```
-
-This method allows for more granular control over the quantization specifications of individual 
-model-level components of a pipeline. It also allows for different quantization backends for
-different components. In the above example, you used a combination of Quanto and BitsandBytes. However,
-one caveat of this method is that users need to know which components come from `transformers` to be able
-to import the right quantization config class.
-
-The other method is simpler in terms of experience but is
-less-flexible. Start by defining a `PipelineQuantizationConfig` but in a different way:
-
-```py
-pipeline_quant_config = PipelineQuantizationConfig(
-    quant_backend="bitsandbytes_4bit",
-    quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
-    components_to_quantize=["transformer", "text_encoder_2"],
-)
-```
-
-This `pipeline_quant_config` can now be passed to [`~DiffusionPipeline.from_pretrained`] similar to the above example.
-
-In this case, `quant_kwargs` will be used to initialize the quantization specifications
-of the respective quantization configuration class of `quant_backend`. `components_to_quantize`
-is used to denote the components that will be quantized. For most pipelines, you would want to
-keep `transformer` in the list as that is often the most compute and memory intensive.
-
-The config below will work for most diffusion pipelines that have a `transformer` component present.
-In most case, you will want to quantize the `transformer` component as that is often the most compute-
-intensive part of a diffusion pipeline.
-
-```py
-pipeline_quant_config = PipelineQuantizationConfig(
-    quant_backend="bitsandbytes_4bit",
-    quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
-    components_to_quantize=["transformer"],
-)
-```
-
-Below is a list of the supported quantization backends available in both `diffusers` and `transformers`:
-
-* `bitsandbytes_4bit` 
-* `bitsandbytes_8bit`
-* `gguf`
-* `quanto`
-* `torchao`
-
-
-Diffusion pipelines can have multiple text encoders. [`FluxPipeline`] has two, for example. It's
-recommended to quantize the text encoders that are memory-intensive. Some examples include T5,
-Llama, Gemma, etc. In the above example, you quantized the T5 model of [`FluxPipeline`] through
-`text_encoder_2` while keeping the CLIP model intact (accessible through `text_encoder`). 
@@ -85,7 +85,7 @@ The quantization methods supported are as follows:
 | **Category** | **Full Function Names** | **Shorthands** |
 |--------------|-------------------------|----------------|
 | **Integer quantization** | `int4_weight_only`, `int8_dynamic_activation_int4_weight`, `int8_weight_only`, `int8_dynamic_activation_int8_weight` | `int4wo`, `int4dq`, `int8wo`, `int8dq` |
-| **Floating point 8-bit quantization** | `float8_weight_only`, `float8_dynamic_activation_float8_weight`, `float8_static_activation_float8_weight` | `float8wo`, `float8wo_e5m2`, `float8wo_e4m3`, `float8dq`, `float8dq_e4m3`, `float8dq_e4m3_tensor`, `float8dq_e4m3_row` |
+| **Floating point 8-bit quantization** | `float8_weight_only`, `float8_dynamic_activation_float8_weight`, `float8_static_activation_float8_weight` | `float8wo`, `float8wo_e5m2`, `float8wo_e4m3`, `float8dq`, `float8dq_e4m3`, `float8_e4m3_tensor`, `float8_e4m3_row` |
 | **Floating point X-bit quantization** | `fpx_weight_only` | `fpX_eAwB` where `X` is the number of bits (1-7), `A` is exponent bits, and `B` is mantissa bits. Constraint: `X == A + B + 1` |
 | **Unsigned Integer quantization** | `uintx_weight_only` | `uint1wo`, `uint2wo`, `uint3wo`, `uint4wo`, `uint5wo`, `uint6wo`, `uint7wo` |

@@ -0,0 +1,139 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Working with big models
+
+A modern diffusion model, like [Stable Diffusion XL (SDXL)](../using-diffusers/sdxl), is not just a single model, but a collection of multiple models. SDXL has four different model-level components:
+
+* A variational autoencoder (VAE)
+* Two text encoders
+* A UNet for denoising
+
+Usually, the text encoders and the denoiser are much larger compared to the VAE.
+
+As models get bigger and better, it’s possible your model is so big that even a single copy won’t fit in memory. But that doesn’t mean it can’t be loaded. If you have more than one GPU, there is more memory available to store your model. In this case, it’s better to split your model checkpoint into several smaller *checkpoint shards*.
+
+When a text encoder checkpoint has multiple shards, like [T5-xxl for SD3](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers/tree/main/text_encoder_3), it is automatically handled by the [Transformers](https://huggingface.co/docs/transformers/index) library as it is a required dependency of Diffusers when using the [`StableDiffusion3Pipeline`]. More specifically, Transformers will automatically handle the loading of multiple shards within the requested model class and get it ready so that inference can be performed.
+
+The denoiser checkpoint can also have multiple shards and supports inference thanks to the [Accelerate](https://huggingface.co/docs/accelerate/index) library.
+
+> [!TIP]
+> Refer to the [Handling big models for inference](https://huggingface.co/docs/accelerate/main/en/concept_guides/big_model_inference) guide for general guidance when working with big models that are hard to fit into memory.
+
+For example, let's save a sharded checkpoint for the [SDXL UNet](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/tree/main/unet):
+
+```python
+from diffusers import AutoModel
+
+unet = AutoModel.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet"
+)
+unet.save_pretrained("sdxl-unet-sharded", max_shard_size="5GB")
+```
+
+The size of the fp32 variant of the SDXL UNet checkpoint is ~10.4GB. Set the `max_shard_size` parameter to 5GB to create 3 shards. After saving, you can load them in [`StableDiffusionXLPipeline`]:
+
+```python
+from diffusers import AutoModel, StableDiffusionXLPipeline
+import torch
+
+unet = AutoModel.from_pretrained(
+    "sayakpaul/sdxl-unet-sharded", torch_dtype=torch.float16
+)
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16
+).to("cuda")
+
+image = pipeline("a cute dog running on the grass", num_inference_steps=30).images[0]
+image.save("dog.png")
+```
+
+If placing all the model-level components on the GPU at once is not feasible, use [`~DiffusionPipeline.enable_model_cpu_offload`] to help you:
+
+```diff
+- pipeline.to("cuda")
++ pipeline.enable_model_cpu_offload()
+```
+
+In general, we recommend sharding when a checkpoint is more than 5GB (in fp32).
+
+## Device placement
+
+On distributed setups, you can run inference across multiple GPUs with Accelerate.
+
+> [!WARNING]
+> This feature is experimental and its APIs might change in the future.
+
+With Accelerate, you can use the `device_map` to determine how to distribute the models of a pipeline across multiple devices. This is useful in situations where you have more than one GPU.
+
+For example, if you have two 8GB GPUs, then using [`~DiffusionPipeline.enable_model_cpu_offload`] may not work so well because:
+
+* it only works on a single GPU
+* a single model might not fit on a single GPU ([`~DiffusionPipeline.enable_sequential_cpu_offload`] might work but it will be extremely slow and it is also limited to a single GPU)
+
+To make use of both GPUs, you can use the "balanced" device placement strategy which splits the models across all available GPUs.
+
+> [!WARNING]
+> Only the "balanced" strategy is supported at the moment, and we plan to support additional mapping strategies in the future.
+
+```diff
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = DiffusionPipeline.from_pretrained(
+-    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True,
++    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True, device_map="balanced"
+)
+image = pipeline("a dog").images[0]
+image
+```
+
+You can also pass a dictionary to enforce the maximum GPU memory that can be used on each device:
+
+```diff
+from diffusers import DiffusionPipeline
+import torch
+
+max_memory = {0:"1GB", 1:"1GB"}
+pipeline = DiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+    device_map="balanced",
++   max_memory=max_memory
+)
+image = pipeline("a dog").images[0]
+image
+```
+
+If a device is not present in `max_memory`, then it will be completely ignored and will not participate in the device placement.
+
+By default, Diffusers uses the maximum memory of all devices. If the models don't fit on the GPUs, they are offloaded to the CPU. If the CPU doesn't have enough memory, then you might see an error. In that case, you could fall back to [`~DiffusionPipeline.enable_sequential_cpu_offload`] or [`~DiffusionPipeline.enable_model_cpu_offload`].
+
+Call [`~DiffusionPipeline.reset_device_map`] to reset the `device_map` of a pipeline. This is also necessary if you want to use methods like `to()`, [`~DiffusionPipeline.enable_sequential_cpu_offload`], and [`~DiffusionPipeline.enable_model_cpu_offload`] on a pipeline that was device-mapped.
+
+```py
+pipeline.reset_device_map()
+```
+
+Once a pipeline has been device-mapped, you can also access its device map via `hf_device_map`:
+
+```py
+print(pipeline.hf_device_map)
+```
+
+An example device map would look like so:
+
+
+```bash
+{'unet': 1, 'vae': 1, 'safety_checker': 0, 'text_encoder': 0}
+```
@@ -10,625 +10,218 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# LoRA
+[[open-in-colab]]

-[LoRA (Low-Rank Adaptation)](https://huggingface.co/papers/2106.09685) is a method for quickly training a model for a new task. It works by freezing the original model weights and adding a small number of *new* trainable parameters. This means it is significantly faster and cheaper to adapt an existing model to new tasks, such as generating images in a new style.
+# Load LoRAs for inference

-LoRA checkpoints are typically only a couple hundred MBs in size, so they're very lightweight and easy to store. Load these smaller set of weights into an existing base model with [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] and specify the file name.
+There are many adapter types (with [LoRAs](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) being the most popular) trained in different styles to achieve different effects. You can even combine multiple adapters to create new and unique images.

-<hfoptions id="usage">
-<hfoption id="text-to-image">
+In this tutorial, you'll learn how to easily load and manage adapters for inference with the 🤗 [PEFT](https://huggingface.co/docs/peft/index) integration in 🤗 Diffusers. You'll use LoRA as the main adapter technique, so you'll see the terms LoRA and adapter used interchangeably.

-```py
-import torch
-from diffusers import AutoPipelineForText2Image
+Let's first install all the required libraries.

-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/super-cereal-sdxl-lora",
-    weight_name="cereal_box_sdxl_v1.safetensors",
-    adapter_name="cereal"
-)
-pipeline("bears, pizza bites").images[0]
+```bash
+!pip install -q transformers accelerate peft diffusers
 ```

-</hfoption>
-<hfoption id="text-to-video">
+Now, load a pipeline with a [Stable Diffusion XL (SDXL)](../api/pipelines/stable_diffusion/stable_diffusion_xl) checkpoint:

-```py
-import torch
-from diffusers import LTXConditionPipeline
-from diffusers.utils import export_to_video, load_image
-
-pipeline = LTXConditionPipeline.from_pretrained(
-    "Lightricks/LTX-Video-0.9.5", torch_dtype=torch.bfloat16
-)
-
-pipeline.load_lora_weights(
-    "Lightricks/LTX-Video-Cakeify-LoRA",
-    weight_name="ltxv_095_cakeify_lora.safetensors",
-    adapter_name="cakeify"
-)
-pipeline.set_adapters("cakeify")
-
-# use "CAKEIFY" to trigger the LoRA
-prompt = "CAKEIFY a person using a knife to cut a cake shaped like a Pikachu plushie"
-image = load_image("https://huggingface.co/Lightricks/LTX-Video-Cakeify-LoRA/resolve/main/assets/images/pikachu.png")
-
-video = pipeline(
-    prompt=prompt,
-    image=image,
-    width=576,
-    height=576,
-    num_frames=161,
-    decode_timestep=0.03,
-    decode_noise_scale=0.025,
-    num_inference_steps=50,
-).frames[0]
-export_to_video(video, "output.mp4", fps=26)
-```
-
-</hfoption>
-</hfoptions>
-
-The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method is the preferred way to load LoRA weights into the UNet and text encoder because it can handle cases where:
-
- the LoRA weights don't have separate UNet and text encoder identifiers
- the LoRA weights have separate UNet and text encoder identifiers
-
-The [`~loaders.PeftAdapterMixin.load_lora_adapter`] method is used to directly load a LoRA adapter at the *model-level*, as long as the model is a Diffusers model that is a subclass of [`PeftAdapterMixin`]. It builds and prepares the necessary model configuration for the adapter. This method also loads the LoRA adapter into the UNet.
-
-For example, if you're only loading a LoRA into the UNet, [`~loaders.PeftAdapterMixin.load_lora_adapter`] ignores the text encoder keys. Use the `prefix` parameter to filter and load the appropriate state dicts, `"unet"` to load.
-
-```py
-import torch
-from diffusers import AutoPipelineForText2Image
-
-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.unet.load_lora_adapter(
-    "jbilcke-hf/sdxl-cinematic-1",
-    weight_name="pytorch_lora_weights.safetensors",
-    adapter_name="cinematic"
-    prefix="unet"
-)
-# use cnmt in the prompt to trigger the LoRA
-pipeline("A cute cnmt eating a slice of pizza, stunning color scheme, masterpiece, illustration").images[0]
-```
-
-## torch.compile
-
-[torch.compile](../optimization/torch2.0#torchcompile) speeds up inference by compiling the PyTorch model to use optimized kernels. Before compiling, the LoRA weights need to be fused into the base model and unloaded first.
-
-```py
-import torch
+```python
 from diffusers import DiffusionPipeline
-
-# load base model and LoRA
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/ikea-instructions-lora-sdxl",
-    weight_name="ikea_instructions_xl_v1_5.safetensors",
-    adapter_name="ikea"
-)
-
-# activate LoRA and set adapter weight
-pipeline.set_adapters("ikea", adapter_weights=0.7)
-
-# fuse LoRAs and unload weights
-pipeline.fuse_lora(adapter_names=["ikea"], lora_scale=1.0)
-pipeline.unload_lora_weights()
-```
-
-Typically, the UNet is compiled because its the most compute intensive component of the pipeline.
-
-```py
-pipeline.unet.to(memory_format=torch.channels_last)
-pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
-
-pipeline("A bowl of ramen shaped like a cute kawaii bear").images[0]
-```
-
-Refer to the [hotswapping](#hotswapping) section to learn how to avoid recompilation when working with compiled models and multiple LoRAs.
-
-## Weight scale
-
-The `scale` parameter is used to control how much of a LoRA to apply. A value of `0` is equivalent to only using the base model weights and a value of `1` is equivalent to fully using the LoRA.
-
-<hfoptions id="weight-scale">
-<hfoption id="simple use case">
-
-For simple use cases, you can pass `cross_attention_kwargs={"scale": 1.0}` to the pipeline.
-
-```py
 import torch
-from diffusers import AutoPipelineForText2Image

-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/super-cereal-sdxl-lora",
-    weight_name="cereal_box_sdxl_v1.safetensors",
-    adapter_name="cereal"
-)
-pipeline("bears, pizza bites", cross_attention_kwargs={"scale": 1.0}).images[0]
+pipe_id = "stabilityai/stable-diffusion-xl-base-1.0"
+pipe = DiffusionPipeline.from_pretrained(pipe_id, torch_dtype=torch.float16).to("cuda")
 ```

-</hfoption>
-<hfoption id="finer control">
+Next, load a [CiroN2022/toy-face](https://huggingface.co/CiroN2022/toy-face) adapter with the [`~diffusers.loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] method. With the 🤗 PEFT integration, you can assign a specific `adapter_name` to the checkpoint, which lets you easily switch between different LoRA checkpoints. Let's call this adapter `"toy"`.

-> [!WARNING]
-> The [`~loaders.PeftAdapterMixin.set_adapters`] method only scales attention weights. If a LoRA has ResNets or down and upsamplers, these components keep a scale value of `1.0`.
+```python
+pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
+```

-For finer control over each individual component of the UNet or text encoder, pass a dictionary instead. In the example below, the `"down"` block in the UNet is scaled by 0.9 and you can further specify in the `"up"` block the scales of the transformers in `"block_0"` and `"block_1"`. If a block like `"mid"` isn't specified, the default value `1.0` is used.
+Make sure to include the token `toy_face` in the prompt and then you can perform inference:

-```py
-import torch
-from diffusers import AutoPipelineForText2Image
+```python
+prompt = "toy_face of a hacker with a hoodie"

-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/super-cereal-sdxl-lora",
-    weight_name="cereal_box_sdxl_v1.safetensors",
-    adapter_name="cereal"
-)
-scales = {
-    "text_encoder": 0.5,
-    "text_encoder_2": 0.5,
+lora_scale = 0.9
+image = pipe(
+    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
+).images[0]
+image
+```
+
+![toy-face](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_8_1.png)
+
+With the `adapter_name` parameter, it is really easy to use another adapter for inference! Load the [nerijs/pixel-art-xl](https://huggingface.co/nerijs/pixel-art-xl) adapter that has been fine-tuned to generate pixel art images and call it `"pixel"`.
+
+The pipeline automatically sets the first loaded adapter (`"toy"`) as the active adapter, but you can activate the `"pixel"` adapter with the [`~loaders.peft.PeftAdapterMixin.set_adapters`] method:
+
+```python
+pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
+pipe.set_adapters("pixel")
+```
+
+Make sure you include the token `pixel art` in your prompt to generate a pixel art image:
+
+```python
+prompt = "a hacker with a hoodie, pixel art"
+image = pipe(
+    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
+).images[0]
+image
+```
+
+![pixel-art](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_12_1.png)
+
+<Tip>
+
+By default, if the most up-to-date versions of PEFT and Transformers are detected, `low_cpu_mem_usage` is set to `True` to speed up the loading time of LoRA checkpoints.
+
+</Tip>
+
+## Merge adapters
+
+You can also merge different adapter checkpoints for inference to blend their styles together.
+
+Once again, use the [`~loaders.peft.PeftAdapterMixin.set_adapters`] method to activate the `pixel` and `toy` adapters and specify the weights for how they should be merged.
+
+```python
+pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.5, 1.0])
+```
+
+<Tip>
+
+LoRA checkpoints in the diffusion community are almost always obtained with [DreamBooth](https://huggingface.co/docs/diffusers/main/en/training/dreambooth). DreamBooth training often relies on "trigger" words in the input text prompts in order for the generation results to look as expected. When you combine multiple LoRA checkpoints, it's important to ensure the trigger words for the corresponding LoRA checkpoints are present in the input text prompts.
+
+</Tip>
+
+Remember to use the trigger words for [CiroN2022/toy-face](https://hf.co/CiroN2022/toy-face) and [nerijs/pixel-art-xl](https://hf.co/nerijs/pixel-art-xl) (these are found in their repositories) in the prompt to generate an image.
+
+```python
+prompt = "toy_face of a hacker with a hoodie, pixel art"
+image = pipe(
+    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": 1.0}, generator=torch.manual_seed(0)
+).images[0]
+image
+```
+
+![toy-face-pixel-art](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_16_1.png)
+
+Impressive! As you can see, the model generated an image that mixed the characteristics of both adapters.
+
+> [!TIP]
+> Through its PEFT integration, Diffusers also offers more efficient merging methods which you can learn about in the [Merge LoRAs](../using-diffusers/merge_loras) guide!
+
+To return to only using one adapter, use the [`~loaders.peft.PeftAdapterMixin.set_adapters`] method to activate the `"toy"` adapter:
+
+```python
+pipe.set_adapters("toy")
+
+prompt = "toy_face of a hacker with a hoodie"
+lora_scale = 0.9
+image = pipe(
+    prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
+).images[0]
+image
+```
+
+Or, to disable all adapters entirely, use the [`~loaders.peft.PeftAdapterMixin.disable_lora`] method to return to the base model.
+
+```python
+pipe.disable_lora()
+
+prompt = "toy_face of a hacker with a hoodie"
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+image
+```
+
+![no-lora](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_20_1.png)
+
+### Customize adapters strength
+
+For even more customization, you can control how strongly the adapter affects each part of the pipeline. For this, pass a dictionary with the control strengths (called "scales") to [`~loaders.peft.PeftAdapterMixin.set_adapters`].
+
+For example, here's how you can turn on the adapter for the `down` parts, but turn it off for the `mid` and `up` parts:
+```python
+pipe.enable_lora()  # enable lora again, after we disabled it above
+prompt = "toy_face of a hacker with a hoodie, pixel art"
+adapter_weight_scales = { "unet": { "down": 1, "mid": 0, "up": 0} }
+pipe.set_adapters("pixel", adapter_weight_scales)
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+image
+```
+
+![block-lora-text-and-down](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_down.png)
+
+Let's see how turning off the `down` part and turning on the `mid` and `up` parts, respectively, changes the image.
+```python
+adapter_weight_scales = { "unet": { "down": 0, "mid": 1, "up": 0} }
+pipe.set_adapters("pixel", adapter_weight_scales)
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+image
+```
+
+![block-lora-text-and-mid](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_mid.png)
+
+```python
+adapter_weight_scales = { "unet": { "down": 0, "mid": 0, "up": 1} }
+pipe.set_adapters("pixel", adapter_weight_scales)
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+image
+```
+
+![block-lora-text-and-up](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_up.png)
+
+Looks cool!
+
+This is a really powerful feature. You can use it to control the adapter strengths down to the per-transformer level. And you can even use it for multiple adapters.
+```python
+adapter_weight_scales_toy = 0.5
+adapter_weight_scales_pixel = {
    "unet": {
-        "down": 0.9,
+        "down": 0.9,  # all transformers in the down-part will use scale 0.9
+        # "mid"  # because, in this example, "mid" is not given, all transformers in the mid part will use the default scale 1.0
        "up": {
-            "block_0": 0.6,
-            "block_1": [0.4, 0.8, 1.0],
+            "block_0": 0.6,  # all 3 transformers in the 0th block in the up-part will use scale 0.6
+            "block_1": [0.4, 0.8, 1.0],  # the 3 transformers in the 1st block in the up-part will use scales 0.4, 0.8 and 1.0 respectively
        }
    }
 }
-pipeline.set_adapters("cereal", scales)
-pipeline("bears, pizza bites").images[0]
+pipe.set_adapters(["toy", "pixel"], [adapter_weight_scales_toy, adapter_weight_scales_pixel])
+image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
+image
 ```

-</hfoption>
-</hfoptions>
+![block-lora-mixed](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_mixed.png)

-## Hotswapping
+## Manage adapters

-Hotswapping LoRAs is an efficient way to work with multiple LoRAs while avoiding accumulating memory from multiple calls to [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] and in some cases, recompilation, if a model is compiled. This workflow requires a loaded LoRA because the new LoRA weights are swapped in place for the existing loaded LoRA.
+You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.StableDiffusionLoraLoaderMixin.get_active_adapters`] method to check the list of active adapters:

 ```py
-import torch
-from diffusers import DiffusionPipeline
-
-# load base model and LoRAs
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/ikea-instructions-lora-sdxl",
-    weight_name="ikea_instructions_xl_v1_5.safetensors",
-    adapter_name="ikea"
-)
+active_adapters = pipe.get_active_adapters()
+active_adapters
+["toy", "pixel"]
 ```

-> [!WARNING]
-> Hotswapping is unsupported for LoRAs that target the text encoder.
-
-Set `hotswap=True` in [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] to swap the second LoRA. Use the `adapter_name` parameter to indicate which LoRA to swap (`default_0` is the default name).
+You can also get the active adapters of each pipeline component with [`~diffusers.loaders.StableDiffusionLoraLoaderMixin.get_list_adapters`]:

 ```py
-pipeline.load_lora_weights(
-    "lordjia/by-feng-zikai",
-    hotswap=True,
-    adapter_name="ikea"
-)
+list_adapters_component_wise = pipe.get_list_adapters()
+list_adapters_component_wise
+{"text_encoder": ["toy", "pixel"], "unet": ["toy", "pixel"], "text_encoder_2": ["toy", "pixel"]}
 ```

-### Compiled models
-
-For compiled models, use [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`] to avoid recompilation when hotswapping LoRAs. This method should be called *before* loading the first LoRA and `torch.compile` should be called *after* loading the first LoRA.
-
-> [!TIP]
-> The [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`] method isn't always necessary if the second LoRA targets the identical LoRA ranks and scales as the first LoRA.
-
-Within [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`], the `target_rank` parameter is important for setting the rank for all LoRA adapters. Setting it to `max_rank` sets it to the highest value. For LoRAs with different ranks, you set it to a higher rank value. The default rank value is 128.
+The [`~loaders.peft.PeftAdapterMixin.delete_adapters`] method completely removes an adapter and its LoRA layers from a model.

 ```py
-import torch
-from diffusers import DiffusionPipeline
-
-# load base model and LoRAs
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-# 1. enable_lora_hotswap
-pipeline.enable_lora_hotswap(target_rank=max_rank)
-pipeline.load_lora_weights(
-    "ostris/ikea-instructions-lora-sdxl",
-    weight_name="ikea_instructions_xl_v1_5.safetensors",
-    adapter_name="ikea"
-)
-# 2. torch.compile
-pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
-
-# 3. hotswap
-pipeline.load_lora_weights(
-    "lordjia/by-feng-zikai",
-    hotswap=True,
-    adapter_name="ikea"
-)
+pipe.delete_adapters("toy")
+pipe.get_active_adapters()
+["pixel"]
 ```

-> [!TIP]
-> Move your code inside the `with torch._dynamo.config.patch(error_on_recompile=True)` context manager to detect if a model was recompiled. If a model is recompiled despite following all the steps above, please open an [issue](https://github.com/huggingface/diffusers/issues) with a reproducible example.
+## PeftInputAutocastDisableHook

-There are still scenarios where recompulation is unavoidable, such as when the hotswapped LoRA targets more layers than the initial adapter. Try to load the LoRA that targets the most layers *first*. For more details about this limitation, refer to the PEFT [hotswapping](https://huggingface.co/docs/peft/main/en/package_reference/hotswap#peft.utils.hotswap.hotswap_adapter) docs.
-
-## Merge
-
-The weights from each LoRA can be merged together to produce a blend of multiple existing styles. There are several methods for merging LoRAs, each of which differ in *how* the weights are merged (may affect generation quality).
-
-### set_adapters
-
-The [`~loaders.PeftAdapterMixin.set_adapters`] method merges LoRAs by concatenating their weighted matrices. Pass the LoRA names to [`~loaders.PeftAdapterMixin.set_adapters`] and use the `adapter_weights` parameter to control the scaling of each LoRA. For example, if `adapter_weights=[0.5, 0.5]`, the output is an average of both LoRAs.
-
-> [!TIP]
-> The `"scale"` parameter determines how much of the merged LoRA to apply. See the [Weight scale](#weight-scale) section for more details.
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/ikea-instructions-lora-sdxl",
-    weight_name="ikea_instructions_xl_v1_5.safetensors",
-    adapter_name="ikea"
-)
-pipeline.load_lora_weights(
-    "lordjia/by-feng-zikai",
-    weight_name="fengzikai_v1.0_XL.safetensors",
-    adapter_name="feng"
-)
-pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
-# use by Feng Zikai to activate the lordjia/by-feng-zikai LoRA
-pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai", cross_attention_kwargs={"scale": 1.0}).images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lora_merge_set_adapters.png"/>
-</div>
-
-### add_weighted_adapter
-
-> [!TIP]
-> This is an experimental method and you can refer to PEFTs [Model merging](https://huggingface.co/docs/peft/developer_guides/model_merging) for more details. Take a look at this [issue](https://github.com/huggingface/diffusers/issues/6892) if you're interested in the motivation and design behind this integration.
-
-The [`~peft.LoraModel.add_weighted_adapter`] method enables more efficient merging methods like [TIES](https://huggingface.co/papers/2306.01708) or [DARE](https://huggingface.co/papers/2311.03099). These merging methods remove redundant and potentially interfering parameters from merged models. Keep in mind the LoRA ranks need to have identical ranks to be merged.
-
-Make sure the latest stable version of Diffusers and PEFT is installed.
-
-```bash
-pip install -U -q diffusers peft
-```
-
-Load a UNET that corresponds to the LoRA UNet.
-
-```py
-import copy
-import torch
-from diffusers import AutoModel, DiffusionPipeline
-from peft import get_peft_model, LoraConfig, PeftModel
-
-unet = AutoModel.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16,
-    use_safetensors=True,
-    variant="fp16",
-    subfolder="unet",
-).to("cuda")
-```
-
-Load a pipeline, pass the UNet to it, and load a LoRA.
-
-```py
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    variant="fp16",
-    torch_dtype=torch.float16,
-    unet=unet
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/ikea-instructions-lora-sdxl",
-    weight_name="ikea_instructions_xl_v1_5.safetensors",
-    adapter_name="ikea"
-)
-```
-
-Create a [`~peft.PeftModel`] from the LoRA checkpoint by combining the first UNet you loaded and the LoRA UNet from the pipeline.
-
-```py
-sdxl_unet = copy.deepcopy(unet)
-ikea_peft_model = get_peft_model(
-    sdxl_unet,
-    pipeline.unet.peft_config["ikea"],
-    adapter_name="ikea"
-)
-
-original_state_dict = {f"base_model.model.{k}": v for k, v in pipeline.unet.state_dict().items()}
-ikea_peft_model.load_state_dict(original_state_dict, strict=True)
-```
-
-> [!TIP]
-> You can save and reuse the `ikea_peft_model` by pushing it to the Hub as shown below.
-> ```py
-> ikea_peft_model.push_to_hub("ikea_peft_model", token=TOKEN)
-> ```
-
-Repeat this process and create a [`~peft.PeftModel`] for the second LoRA.
-
-```py
-pipeline.delete_adapters("ikea")
-sdxl_unet.delete_adapters("ikea")
-
-pipeline.load_lora_weights(
-    "lordjia/by-feng-zikai",
-    weight_name="fengzikai_v1.0_XL.safetensors",
-    adapter_name="feng"
-)
-pipeline.set_adapters(adapter_names="feng")
-
-feng_peft_model = get_peft_model(
-    sdxl_unet,
-    pipeline.unet.peft_config["feng"],
-    adapter_name="feng"
-)
-
-original_state_dict = {f"base_model.model.{k}": v for k, v in pipe.unet.state_dict().items()}
-feng_peft_model.load_state_dict(original_state_dict, strict=True)
-```
-
-Load a base UNet model and load the adapters.
-
-```py
-base_unet = AutoModel.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16,
-    use_safetensors=True,
-    variant="fp16",
-    subfolder="unet",
-).to("cuda")
-
-model = PeftModel.from_pretrained(
-    base_unet,
-    "stevhliu/ikea_peft_model",
-    use_safetensors=True,
-    subfolder="ikea",
-    adapter_name="ikea"
-)
-model.load_adapter(
-    "stevhliu/feng_peft_model",
-    use_safetensors=True,
-    subfolder="feng",
-    adapter_name="feng"
-)
-```
-
-Merge the LoRAs with [`~peft.LoraModel.add_weighted_adapter`] and specify how you want to merge them with `combination_type`. The example below uses the `"dare_linear"` method (refer to this [blog post](https://huggingface.co/blog/peft_merging) to learn more about these merging methods), which randomly prunes some weights and then performs a weighted sum of the tensors based on the set weightage of each LoRA in `weights`.
-
-Activate the merged LoRAs with [`~loaders.PeftAdapterMixin.set_adapters`].
-
-```py
-model.add_weighted_adapter(
-    adapters=["ikea", "feng"],
-    combination_type="dare_linear",
-    weights=[1.0, 1.0],
-    adapter_name="ikea-feng"
-)
-model.set_adapters("ikea-feng")
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    unet=model,
-    variant="fp16",
-    torch_dtype=torch.float16,
-).to("cuda")
-pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai").images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ikea-feng-dare-linear.png"/>
-</div>
-
-### fuse_lora
-
-The [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method fuses the LoRA weights directly with the original UNet and text encoder weights of the underlying model. This reduces the overhead of loading the underlying model for each LoRA because it only loads the model once, which lowers memory usage and increases inference speed.
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/ikea-instructions-lora-sdxl",
-    weight_name="ikea_instructions_xl_v1_5.safetensors",
-    adapter_name="ikea"
-)
-pipeline.load_lora_weights(
-    "lordjia/by-feng-zikai",
-    weight_name="fengzikai_v1.0_XL.safetensors",
-    adapter_name="feng"
-)
-pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
-```
-
-Call [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] to fuse them. The `lora_scale` parameter controls how much to scale the output by with the LoRA weights. It is important to make this adjustment now because passing `scale` to `cross_attention_kwargs` won't work in the pipeline.
-
-```py
-pipeline.fuse_lora(adapter_names=["ikea", "feng"], lora_scale=1.0)
-```
-
-Unload the LoRA weights since they're already fused with the underlying model. Save the fused pipeline with either [`~DiffusionPipeline.save_pretrained`] to save it locally or [`~PushToHubMixin.push_to_hub`] to save it to the Hub.
-
-<hfoptions id="save">
-<hfoption id="save locally">
-
-```py
-pipeline.unload_lora_weights()
-pipeline.save_pretrained("path/to/fused-pipeline")
-```
-
-</hfoption>
-<hfoption id="save to Hub">
-
-```py
-pipeline.unload_lora_weights()
-pipeline.push_to_hub("fused-ikea-feng")
-```
-
-</hfoption>
-</hfoptions>
-
-The fused pipeline can now be quickly loaded for inference without requiring each LoRA to be separately loaded.
-
-```py
-pipeline = DiffusionPipeline.from_pretrained(
-    "username/fused-ikea-feng", torch_dtype=torch.float16,
-).to("cuda")
-pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai").images[0]
-```
-
-Use [`~loaders.LoraLoaderMixin.unfuse_lora`] to restore the underlying models weights, for example, if you want to use a different `lora_scale` value. You can only unfuse if there is a single LoRA fused. For example, it won't work with the pipeline from above because there are multiple fused LoRAs. In these cases, you'll need to reload the entire model.
-
-```py
-pipeline.unfuse_lora()
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fuse_lora.png"/>
-</div>
-
-## Manage
-
-Diffusers provides several methods to help you manage working with LoRAs. These methods can be especially useful if you're working with multiple LoRAs.
-
-### set_adapters
-
-[`~loaders.PeftAdapterMixin.set_adapters`] also activates the current LoRA to use if there are multiple active LoRAs. This allows you to switch between different LoRAs by specifying their name.
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_lora_weights(
-    "ostris/ikea-instructions-lora-sdxl",
-    weight_name="ikea_instructions_xl_v1_5.safetensors",
-    adapter_name="ikea"
-)
-pipeline.load_lora_weights(
-    "lordjia/by-feng-zikai",
-    weight_name="fengzikai_v1.0_XL.safetensors",
-    adapter_name="feng"
-)
-# activates the feng LoRA instead of the ikea LoRA
-pipeline.set_adapters("feng")
-```
-
-### save_lora_adapter
-
-Save an adapter with [`~loaders.PeftAdapterMixin.save_lora_adapter`].
-
-```py
-import torch
-from diffusers import AutoPipelineForText2Image
-
-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.unet.load_lora_adapter(
-    "jbilcke-hf/sdxl-cinematic-1",
-    weight_name="pytorch_lora_weights.safetensors",
-    adapter_name="cinematic"
-    prefix="unet"
-)
-pipeline.save_lora_adapter("path/to/save", adapter_name="cinematic")
-```
-
-### unload_lora_weights
-
-The [`~loaders.lora_base.LoraBaseMixin.unload_lora_weights`] method unloads any LoRA weights in the pipeline to restore the underlying model weights.
-
-```py
-pipeline.unload_lora_weights()
-```
-
-### disable_lora
-
-The [`~loaders.PeftAdapterMixin.disable_lora`] method disables all LoRAs (but they're still kept on the pipeline) and restores the pipeline to the underlying model weights.
-
-```py
-pipeline.disable_lora()
-```
-
-### get_active_adapters
-
-The [`~loaders.lora_base.LoraBaseMixin.get_active_adapters`] method returns a list of active LoRAs attached to a pipeline.
-
-```py
-pipeline.get_active_adapters()
-["cereal", "ikea"]
-```
-
-### get_list_adapters
-
-The [`~loaders.lora_base.LoraBaseMixin.get_list_adapters`] method returns the active LoRAs for each component in the pipeline.
-
-```py
-pipeline.get_list_adapters()
-{"unet": ["cereal", "ikea"], "text_encoder_2": ["cereal"]}
-```
-
-### delete_adapters
-
-The [`~loaders.PeftAdapterMixin.delete_adapters`] method completely removes a LoRA and its layers from a model.
-
-```py
-pipeline.delete_adapters("ikea")
-```
-
-## Resources
-
-Browse the [LoRA Studio](https://lorastudio.co/models) for different LoRAs to use or you can upload your favorite LoRAs from Civitai to the Hub with the Space below.
-
-<iframe
-	src="https://multimodalart-civitai-to-hf.hf.space"
-	frameborder="0"
-	width="850"
-	height="450"
-></iframe>
-
-You can find additional LoRAs in the [FLUX LoRA the Explorer](https://huggingface.co/spaces/multimodalart/flux-lora-the-explorer) and [LoRA the Explorer](https://huggingface.co/spaces/multimodalart/LoraTheExplorer) Spaces.
+[[autodoc]] hooks.layerwise_casting.PeftInputAutocastDisableHook
@@ -12,28 +12,46 @@ specific language governing permissions and limitations under the License.

 # ControlNet

-[ControlNet](https://huggingface.co/papers/2302.05543) is an adapter that enables controllable generation such as generating an image of a cat in a *specific pose* or following the lines in a sketch of a *specific* cat. It works by adding a smaller network of "zero convolution" layers and progressively training these to avoid disrupting the original model. The original model parameters are frozen to avoid retraining it.
+ControlNet is a type of model for controlling image diffusion models by conditioning the model with an additional input image. There are many types of conditioning inputs (canny edge, user sketching, human pose, depth, and more) you can use to control a diffusion model. This is hugely useful because it affords you greater control over image generation, making it easier to generate specific images without experimenting with different text prompts or denoising values as much.

-A ControlNet is conditioned on extra visual information or "structural controls" (canny edge, depth maps, human pose, etc.) that can be combined with text prompts to generate images that are guided by the visual input.
+<Tip>

-> [!TIP]
-> ControlNets are available to many models such as [Flux](../api/pipelines/controlnet_flux), [Hunyuan-DiT](../api/pipelines/controlnet_hunyuandit), [Stable Diffusion 3](../api/pipelines/controlnet_sd3), and more. The examples in this guide use Flux and Stable Diffusion XL.
+Check out Section 3.5 of the [ControlNet](https://huggingface.co/papers/2302.05543) paper v1 for a list of ControlNet implementations on various conditioning inputs. You can find the official Stable Diffusion ControlNet conditioned models on [lllyasviel](https://huggingface.co/lllyasviel)'s Hub profile, and more [community-trained](https://huggingface.co/models?other=stable-diffusion&other=controlnet) ones on the Hub.

-Load a ControlNet conditioned on a specific control, such as canny edge, and pass it to the pipeline in [`~DiffusionPipeline.from_pretrained`].
+For Stable Diffusion XL (SDXL) ControlNet models, you can find them on the 🤗 [Diffusers](https://huggingface.co/diffusers) Hub organization, or you can browse [community-trained](https://huggingface.co/models?other=stable-diffusion-xl&other=controlnet) ones on the Hub.

-<hfoptions id="usage">
-<hfoption id="text-to-image">
+</Tip>

-Generate a canny image with [opencv-python](https://github.com/opencv/opencv-python).
+A ControlNet model has two sets of weights (or blocks) connected by a zero-convolution layer:
+
+- a *locked copy* keeps everything a large pretrained diffusion model has learned
+- a *trainable copy* is trained on the additional conditioning input
+
+Since the locked copy preserves the pretrained model, training and implementing a ControlNet on a new conditioning input is as fast as finetuning any other model because you aren't training the model from scratch.
+
+This guide will show you how to use ControlNet for text-to-image, image-to-image, inpainting, and more! There are many types of ControlNet conditioning inputs to choose from, but in this guide we'll only focus on several of them. Feel free to experiment with other conditioning inputs!
+
+Before you begin, make sure you have the following libraries installed:

 ```py
+# uncomment to install the necessary libraries in Colab
+#!pip install -q diffusers transformers accelerate opencv-python
+```
+
+## Text-to-image
+
+For text-to-image, you normally pass a text prompt to the model. But with ControlNet, you can specify an additional conditioning input. Let's condition the model with a canny image, a white outline of an image on a black background. This way, the ControlNet can use the canny image as a control to guide the model to generate an image with the same outline.
+
+Load an image and use the [opencv-python](https://github.com/opencv/opencv-python) library to extract the canny image:
+
+```py
+from diffusers.utils import load_image, make_image_grid
+from PIL import Image
 import cv2
 import numpy as np
-from PIL import Image
-from diffusers.utils import load_image

 original_image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"
+    "https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
 )

 image = np.array(original_image)
@@ -47,300 +65,523 @@ image = np.concatenate([image, image, image], axis=2)
 canny_image = Image.fromarray(image)
 ```

-Pass the canny image to the pipeline. Use the `controlnet_conditioning_scale` parameter to determine how much weight to assign to the control.
-
-```py
-import torch
-from diffusers.utils import load_image
-from diffusers import FluxControlNetPipeline, FluxControlNetModel
-
-controlnet = FluxControlNetModel.from_pretrained(
-    "InstantX/FLUX.1-dev-Controlnet-Canny", torch_dtype=torch.bfloat16
-)
-pipeline = FluxControlNetPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev", controlnet=controlnet, torch_dtype=torch.bfloat16
-).to("cuda")
-
-prompt = """
-A photorealistic overhead image of a cat reclining sideways in a flamingo pool floatie holding a margarita. 
-The cat is floating leisurely in the pool and completely relaxed and happy.
-"""
-
-pipeline(
-    prompt, 
-    control_image=canny_image,
-    controlnet_conditioning_scale=0.5,
-    num_inference_steps=50, 
-    guidance_scale=3.5,
-).images[0]
-```
-
-<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png" width="300" alt="Generated image (prompt only)"/>
-    <figcaption style="text-align: center;">original image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png" width="300" alt="Control image (Canny edges)"/>
-    <figcaption style="text-align: center;">canny image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat-generated.png" width="300" alt="Generated image (ControlNet + prompt)"/>
-    <figcaption style="text-align: center;">generated image</figcaption>
-  </figure>
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">canny image</figcaption>
+  </div>
 </div>

+Next, load a ControlNet model conditioned on canny edge detection and pass it to the [`StableDiffusionControlNetPipeline`]. Use the faster [`UniPCMultistepScheduler`] and enable model offloading to speed up inference and reduce memory usage.

-</hfoption>
-<hfoption id="image-to-image">
+```py
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
+import torch

-Generate a depth map with a depth estimation pipeline from Transformers.
+controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionControlNetPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
+)
+
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+pipe.enable_model_cpu_offload()
+```
+
+Now pass your prompt and canny image to the pipeline:
+
+```py
+output = pipe(
+    "the mona lisa", image=canny_image
+).images[0]
+make_image_grid([original_image, canny_image, output], rows=1, cols=3)
+```
+
+<div class="flex justify-center">
+  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-text2img.png"/>
+</div>
+
+## Image-to-image
+
+For image-to-image, you'd typically pass an initial image and a prompt to the pipeline to generate a new image. With ControlNet, you can pass an additional conditioning input to guide the model. Let's condition the model with a depth map, an image which contains spatial information. This way, the ControlNet can use the depth map as a control to guide the model to generate an image that preserves spatial information.
+
+You'll use the [`StableDiffusionControlNetImg2ImgPipeline`] for this task, which is different from the [`StableDiffusionControlNetPipeline`] because it allows you to pass an initial image as the starting point for the image generation process.
+
+Load an image and use the `depth-estimation` [`~transformers.Pipeline`] from 🤗 Transformers to extract the depth map of an image:

 ```py
 import torch
 import numpy as np
-from PIL import Image
-from transformers import DPTImageProcessor, DPTForDepthEstimation
-from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL
-from diffusers.utils import load_image

+from transformers import pipeline
+from diffusers.utils import load_image, make_image_grid

-depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
-feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
-
-def get_depth_map(image):
-    image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda")
-    with torch.no_grad(), torch.autocast("cuda"):
-        depth_map = depth_estimator(image).predicted_depth
-
-    depth_map = torch.nn.functional.interpolate(
-        depth_map.unsqueeze(1),
-        size=(1024, 1024),
-        mode="bicubic",
-        align_corners=False,
-    )
-    depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
-    depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
-    depth_map = (depth_map - depth_min) / (depth_max - depth_min)
-    image = torch.cat([depth_map] * 3, dim=1)
-    image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
-    image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
-    return image
-
-depth_image = get_depth_map(image)
-```
-
-Pass the depth map to the pipeline. Use the `controlnet_conditioning_scale` parameter to determine how much weight to assign to the control.
-
-```py
-controlnet = ControlNetModel.from_pretrained(
-    "diffusers/controlnet-depth-sdxl-1.0-small",
-    torch_dtype=torch.float16,
-)
-vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
-pipeline = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    controlnet=controlnet,
-    vae=vae,
-    torch_dtype=torch.float16,
-).to("cuda")
-
-prompt = """
-A photorealistic overhead image of a cat reclining sideways in a flamingo pool floatie holding a margarita. 
-The cat is floating leisurely in the pool and completely relaxed and happy.
-"""
 image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"
-).resize((1024, 1024))
-controlnet_conditioning_scale = 0.5 
-pipeline(
-    prompt,
-    image=image,
-    control_image=depth_image,
-    controlnet_conditioning_scale=controlnet_conditioning_scale,
-    strength=0.99,
-    num_inference_steps=100,
-).images[0]
-```
-
-<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png" width="300" alt="Generated image (prompt only)"/>
-    <figcaption style="text-align: center;">original image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_depth_image.png" width="300" alt="Control image (Canny edges)"/>
-    <figcaption style="text-align: center;">depth map</figcaption>
-  </figure>
-  <figure> 
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_depth_cat.png" width="300" alt="Generated image (ControlNet + prompt)"/>
-    <figcaption style="text-align: center;">generated image</figcaption>
-  </figure>
-</div>
-
-</hfoption>
-<hfoption id="inpainting">
-
-Generate a mask image and convert it to a tensor to mark the pixels in the original image as masked if the corresponding pixel in the mask image is over a certain threshold.
-
-```py
-import cv2
-import torch
-import numpy as np
-from PIL import Image
-from diffusers.utils import load_image
-from diffusers import StableDiffusionXLControlNetInpaintPipeline, ControlNetModel
-
-init_image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-img2img.jpg"
 )
-init_image = init_image.resize((1024, 1024))
-mask_image = load_image(
-    "/content/cat_mask.png"
-)
-mask_image = mask_image.resize((1024, 1024))

-def make_canny_condition(image):
+def get_depth_map(image, depth_estimator):
+    image = depth_estimator(image)["depth"]
    image = np.array(image)
-    image = cv2.Canny(image, 100, 200)
    image = image[:, :, None]
    image = np.concatenate([image, image, image], axis=2)
-    image = Image.fromarray(image)
-    return image
+    detected_map = torch.from_numpy(image).float() / 255.0
+    depth_map = detected_map.permute(2, 0, 1)
+    return depth_map

-control_image = make_canny_condition(init_image)
+depth_estimator = pipeline("depth-estimation")
+depth_map = get_depth_map(image, depth_estimator).unsqueeze(0).half().to("cuda")
 ```

-Pass the mask and control image to the pipeline. Use the `controlnet_conditioning_scale` parameter to determine how much weight to assign to the control.
+Next, load a ControlNet model conditioned on depth maps and pass it to the [`StableDiffusionControlNetImg2ImgPipeline`]. Use the faster [`UniPCMultistepScheduler`] and enable model offloading to speed up inference and reduce memory usage.

 ```py
-controlnet = ControlNetModel.from_pretrained(
-    "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
+from diffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler
+import torch
+
+controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11f1p_sd15_depth", torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
 )
-pipeline = StableDiffusionXLControlNetInpaintPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
+
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+pipe.enable_model_cpu_offload()
+```
+
+Now pass your prompt, initial image, and depth map to the pipeline:
+
+```py
+output = pipe(
+    "lego batman and robin", image=image, control_image=depth_map,
+).images[0]
+make_image_grid([image, output], rows=1, cols=2)
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-img2img.jpg"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-img2img-2.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
+  </div>
+</div>
+
+## Inpainting
+
+For inpainting, you need an initial image, a mask image, and a prompt describing what to replace the mask with. ControlNet models allow you to add another control image to condition a model with. Let’s condition the model with an inpainting mask. This way, the ControlNet can use the inpainting mask as a control to guide the model to generate an image within the mask area.
+
+Load an initial image and a mask image:
+
+```py
+from diffusers.utils import load_image, make_image_grid
+
+init_image = load_image(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint.jpg"
 )
-pipeline(
-    "a cute and fluffy bunny rabbit",
-    num_inference_steps=100,
-    strength=0.99,
-    controlnet_conditioning_scale=0.5,
+init_image = init_image.resize((512, 512))
+
+mask_image = load_image(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint-mask.jpg"
+)
+mask_image = mask_image.resize((512, 512))
+make_image_grid([init_image, mask_image], rows=1, cols=2)
+```
+
+Create a function to prepare the control image from the initial and mask images. This'll create a tensor to mark the pixels in `init_image` as masked if the corresponding pixel in `mask_image` is over a certain threshold.
+
+```py
+import numpy as np
+import torch
+
+def make_inpaint_condition(image, image_mask):
+    image = np.array(image.convert("RGB")).astype(np.float32) / 255.0
+    image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0
+
+    assert image.shape[0:2] == image_mask.shape[0:2], "image and image_mask must have the same image size"
+    image[image_mask > 0.5] = -1.0  # set as masked pixel
+    image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
+    image = torch.from_numpy(image)
+    return image
+
+control_image = make_inpaint_condition(init_image, mask_image)
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint.jpg"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint-mask.jpg"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">mask image</figcaption>
+  </div>
+</div>
+
+Load a ControlNet model conditioned on inpainting and pass it to the [`StableDiffusionControlNetInpaintPipeline`]. Use the faster [`UniPCMultistepScheduler`] and enable model offloading to speed up inference and reduce memory usage.
+
+```py
+from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, UniPCMultistepScheduler
+
+controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
+)
+
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+pipe.enable_model_cpu_offload()
+```
+
+Now pass your prompt, initial image, mask image, and control image to the pipeline:
+
+```py
+output = pipe(
+    "corgi face with large ears, detailed, pixar, animated, disney",
+    num_inference_steps=20,
+    eta=1.0,
    image=init_image,
    mask_image=mask_image,
    control_image=control_image,
 ).images[0]
+make_image_grid([init_image, mask_image, output], rows=1, cols=3)
 ```

-<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png" width="300" alt="Generated image (prompt only)"/>
-    <figcaption style="text-align: center;">original image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat_mask.png" width="300" alt="Control image (Canny edges)"/>
-    <figcaption style="text-align: center;">mask image</figcaption>
-  </figure>
-  <figure> 
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_rabbit_inpaint.png" width="300" alt="Generated image (ControlNet + prompt)"/>
-    <figcaption style="text-align: center;">generated image</figcaption>
-  </figure>
+<div class="flex justify-center">
+  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint-result.png"/>
 </div>

-</hfoption>
-</hfoptions>
+## Guess mode

-## Multi-ControlNet
+[Guess mode](https://github.com/lllyasviel/ControlNet/discussions/188) does not require supplying a prompt to a ControlNet at all! This forces the ControlNet encoder to do its best to "guess" the contents of the input control map (depth map, pose estimation, canny edge, etc.).

-You can compose multiple ControlNet conditionings, such as a canny image and a depth map, to create a *MultiControlNet*. For the best results, you should mask conditionings so they don't overlap and experiment with different `controlnet_conditioning_scale` parameters to adjust how much weight is assigned to each control input.
+Guess mode adjusts the scale of the output residuals from a ControlNet by a fixed ratio depending on the block depth. The shallowest `DownBlock` corresponds to 0.1, and as the blocks get deeper, the scale increases exponentially such that the scale of the `MidBlock` output becomes 1.0.

-The example below composes a canny image and depth map.
+<Tip>

-Pass the ControlNets as a list to the pipeline and resize the images to the expected input size.
+Guess mode does not have any impact on prompt conditioning and you can still provide a prompt if you want.
+
+</Tip>
+
+Set `guess_mode=True` in the pipeline, and it is [recommended](https://github.com/lllyasviel/ControlNet#guess-mode--non-prompt-mode) to set the `guidance_scale` value between 3.0 and 5.0.

 ```py
+from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
+from diffusers.utils import load_image, make_image_grid
+import numpy as np
 import torch
+from PIL import Image
+import cv2
+
+controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", use_safetensors=True)
+pipe = StableDiffusionControlNetPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, use_safetensors=True).to("cuda")
+
+original_image = load_image("https://huggingface.co/takuma104/controlnet_dev/resolve/main/bird_512x512.png")
+
+image = np.array(original_image)
+
+low_threshold = 100
+high_threshold = 200
+
+image = cv2.Canny(image, low_threshold, high_threshold)
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image)
+
+image = pipe("", image=canny_image, guess_mode=True, guidance_scale=3.0).images[0]
+make_image_grid([original_image, canny_image, image], rows=1, cols=3)
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare_guess_mode/output_images/diffusers/output_bird_canny_0.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">regular mode with prompt</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare_guess_mode/output_images/diffusers/output_bird_canny_0_gm.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">guess mode without prompt</figcaption>
+  </div>
+</div>
+
+## ControlNet with Stable Diffusion XL
+
+There aren't too many ControlNet models compatible with Stable Diffusion XL (SDXL) at the moment, but we've trained two full-sized ControlNet models for SDXL conditioned on canny edge detection and depth maps. We're also experimenting with creating smaller versions of these SDXL-compatible ControlNet models so it is easier to run on resource-constrained hardware. You can find these checkpoints on the [🤗 Diffusers Hub organization](https://huggingface.co/diffusers)!
+
+Let's use a SDXL ControlNet conditioned on canny images to generate an image. Start by loading an image and preparing the canny image:
+
+```py
 from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
+from diffusers.utils import load_image, make_image_grid
+from PIL import Image
+import cv2
+import numpy as np
+import torch
+
+original_image = load_image(
+    "https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
+)
+
+image = np.array(original_image)
+
+low_threshold = 100
+high_threshold = 200
+
+image = cv2.Canny(image, low_threshold, high_threshold)
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image)
+make_image_grid([original_image, canny_image], rows=1, cols=2)
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hf-logo-canny.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">canny image</figcaption>
+  </div>
+</div>
+
+Load a SDXL ControlNet model conditioned on canny edge detection and pass it to the [`StableDiffusionXLControlNetPipeline`]. You can also enable model offloading to reduce memory usage.
+
+```py
+controlnet = ControlNetModel.from_pretrained(
+    "diffusers/controlnet-canny-sdxl-1.0",
+    torch_dtype=torch.float16,
+    use_safetensors=True
+)
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    controlnet=controlnet,
+    vae=vae,
+    torch_dtype=torch.float16,
+    use_safetensors=True
+)
+pipe.enable_model_cpu_offload()
+```
+
+Now pass your prompt (and optionally a negative prompt if you're using one) and canny image to the pipeline:
+
+<Tip>
+
+The [`controlnet_conditioning_scale`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetPipeline.__call__.controlnet_conditioning_scale) parameter determines how much weight to assign to the conditioning inputs. A value of 0.5 is recommended for good generalization, but feel free to experiment with this number!
+
+</Tip>
+
+```py
+prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
+negative_prompt = 'low quality, bad quality, sketches'
+
+image = pipe(
+    prompt,
+    negative_prompt=negative_prompt,
+    image=canny_image,
+    controlnet_conditioning_scale=0.5,
+).images[0]
+make_image_grid([original_image, canny_image, image], rows=1, cols=3)
+```
+
+<div class="flex justify-center">
+    <img class="rounded-xl" src="https://huggingface.co/diffusers/controlnet-canny-sdxl-1.0/resolve/main/out_hug_lab_7.png"/>
+</div>
+
+You can use [`StableDiffusionXLControlNetPipeline`] in guess mode as well by setting the `guess_mode` parameter to `True`:
+
+```py
+from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
+from diffusers.utils import load_image, make_image_grid
+import numpy as np
+import torch
+import cv2
+from PIL import Image
+
+prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
+negative_prompt = "low quality, bad quality, sketches"
+
+original_image = load_image(
+    "https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
+)
+
+controlnet = ControlNetModel.from_pretrained(
+    "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16, use_safetensors=True
+)
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, vae=vae, torch_dtype=torch.float16, use_safetensors=True
+)
+pipe.enable_model_cpu_offload()
+
+image = np.array(original_image)
+image = cv2.Canny(image, 100, 200)
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image)
+
+image = pipe(
+    prompt, negative_prompt=negative_prompt, controlnet_conditioning_scale=0.5, image=canny_image, guess_mode=True,
+).images[0]
+make_image_grid([original_image, canny_image, image], rows=1, cols=3)
+```
+
+<Tip>
+
+You can use a refiner model with `StableDiffusionXLControlNetPipeline` to improve image quality, just like you can with a regular `StableDiffusionXLPipeline`.
+See the [Refine image quality](./sdxl#refine-image-quality) section to learn how to use the refiner model.
+Make sure to use `StableDiffusionXLControlNetPipeline` and pass `image` and `controlnet_conditioning_scale`.
+
+```py
+base = StableDiffusionXLControlNetPipeline(...)
+image = base(
+    prompt=prompt,
+    controlnet_conditioning_scale=0.5,
+    image=canny_image,
+    num_inference_steps=40,
+    denoising_end=0.8,
+    output_type="latent",
+).images
+# rest exactly as with StableDiffusionXLPipeline
+```
+
+</Tip>
+
+## MultiControlNet
+
+<Tip>
+
+Replace the SDXL model with a model like [stable-diffusion-v1-5/stable-diffusion-v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) to use multiple conditioning inputs with Stable Diffusion models.
+
+</Tip>
+
+You can compose multiple ControlNet conditionings from different image inputs to create a *MultiControlNet*. To get better results, it is often helpful to:
+
+1. mask conditionings such that they don't overlap (for example, mask the area of a canny image where the pose conditioning is located)
+2. experiment with the [`controlnet_conditioning_scale`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetPipeline.__call__.controlnet_conditioning_scale) parameter to determine how much weight to assign to each conditioning input
+
+In this example, you'll combine a canny image and a human pose estimation image to generate a new image.
+
+Prepare the canny image conditioning:
+
+```py
+from diffusers.utils import load_image, make_image_grid
+from PIL import Image
+import numpy as np
+import cv2
+
+original_image = load_image(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/landscape.png"
+)
+image = np.array(original_image)
+
+low_threshold = 100
+high_threshold = 200
+
+image = cv2.Canny(image, low_threshold, high_threshold)
+
+# zero out middle columns of image where pose will be overlaid
+zero_start = image.shape[1] // 4
+zero_end = zero_start + image.shape[1] // 2
+image[:, zero_start:zero_end] = 0
+
+image = image[:, :, None]
+image = np.concatenate([image, image, image], axis=2)
+canny_image = Image.fromarray(image)
+make_image_grid([original_image, canny_image], rows=1, cols=2)
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/landscape.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/controlnet/landscape_canny_masked.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">canny image</figcaption>
+  </div>
+</div>
+
+For human pose estimation, install [controlnet_aux](https://github.com/patrickvonplaten/controlnet_aux):
+
+```py
+# uncomment to install the necessary library in Colab
+#!pip install -q controlnet-aux
+```
+
+Prepare the human pose estimation conditioning:
+
+```py
+from controlnet_aux import OpenposeDetector
+
+openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
+original_image = load_image(
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/person.png"
+)
+openpose_image = openpose(original_image)
+make_image_grid([original_image, openpose_image], rows=1, cols=2)
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/person.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/controlnet/person_pose.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">human pose image</figcaption>
+  </div>
+</div>
+
+Load a list of ControlNet models that correspond to each conditioning, and pass them to the [`StableDiffusionXLControlNetPipeline`]. Use the faster [`UniPCMultistepScheduler`] and enable model offloading to reduce memory usage.
+
+```py
+from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL, UniPCMultistepScheduler
+import torch

 controlnets = [
    ControlNetModel.from_pretrained(
-        "diffusers/controlnet-depth-sdxl-1.0-small", torch_dtype=torch.float16
+        "thibaud/controlnet-openpose-sdxl-1.0", torch_dtype=torch.float16
    ),
    ControlNetModel.from_pretrained(
-        "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16,
+        "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16, use_safetensors=True
    ),
 ]

-vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
-pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnets, vae=vae, torch_dtype=torch.float16
-).to("cuda")
-
-prompt = """
-a relaxed rabbit sitting on a striped towel next to a pool with a tropical drink nearby, 
-bright sunny day, vacation scene, 35mm photograph, film, professional, 4k, highly detailed
-"""
-negative_prompt = "lowres, bad anatomy, worst quality, low quality, deformed, ugly"
-
-images = [canny_image.resize((1024, 1024)), depth_image.resize((1024, 1024))]
-
-pipeline(
-    prompt,
-    negative_prompt=negative_prompt,
-    image=images,
-    num_inference_steps=100,
-    controlnet_conditioning_scale=[0.5, 0.5],
-    strength=0.7,
-).images[0]
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
+pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnets, vae=vae, torch_dtype=torch.float16, use_safetensors=True
+)
+pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
+pipe.enable_model_cpu_offload()
 ```

-<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png" width="300" alt="Generated image (prompt only)"/>
-    <figcaption style="text-align: center;">canny image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/multicontrolnet_depth.png" width="300" alt="Control image (Canny edges)"/>
-    <figcaption style="text-align: center;">depth map</figcaption>
-  </figure>
-  <figure> 
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_multi_controlnet.png" width="300" alt="Generated image (ControlNet + prompt)"/>
-    <figcaption style="text-align: center;">generated image</figcaption>
-  </figure>
-</div>
-
-## guess_mode
-
-[Guess mode](https://github.com/lllyasviel/ControlNet/discussions/188) generates an image from **only** the control input (canny edge, depth map, pose, etc.) and without guidance from a prompt. It adjusts the scale of the ControlNet's output residuals by a fixed ratio depending on block depth. The earlier `DownBlock` is only scaled by `0.1` and the `MidBlock` is fully scaled by `1.0`.
+Now you can pass your prompt (and an optional negative prompt, if you're using one), the canny image, and the pose image to the pipeline:

 ```py
-import torch
-from diffusers.utils import load_iamge
-from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel
+prompt = "a giant standing in a fantasy landscape, best quality"
+negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"

-controlnet = ControlNetModel.from_pretrained(
-  "diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
-)
-pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
-  "stabilityai/stable-diffusion-xl-base-1.0",
-  controlnet=controlnet,
-  torch_dtype=torch.float16
-).to("cuda")
+generator = torch.manual_seed(1)

-canny_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png")
-pipeline(
-  "",
-  image=canny_image,
-  guess_mode=True
-).images[0]
+images = [openpose_image.resize((1024, 1024)), canny_image.resize((1024, 1024))]
+
+images = pipe(
+    prompt,
+    image=images,
+    num_inference_steps=25,
+    generator=generator,
+    negative_prompt=negative_prompt,
+    num_images_per_prompt=3,
+    controlnet_conditioning_scale=[1.0, 0.8],
+).images
+make_image_grid([original_image, canny_image, openpose_image,
+                images[0].resize((512, 512)), images[1].resize((512, 512)), images[2].resize((512, 512))], rows=2, cols=3)
 ```

-<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png" width="300" alt="Control image (Canny edges)"/>
-    <figcaption style="text-align: center;">canny image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/guess_mode.png" width="300" alt="Generated image (Guess mode)"/>
-    <figcaption style="text-align: center;">generated image</figcaption>
-  </figure>
-</div>
+<div class="flex justify-center">
+	<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/multicontrolnet.png"/>
+</div>
@@ -1,35 +0,0 @@
-<!--Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# DreamBooth
-
-[DreamBooth](https://huggingface.co/papers/2208.12242) is a method for generating personalized images of a specific instance. It works by fine-tuning the model on 3-5 images of the subject (for example, a cat) that is associated with a unique identifier (`sks cat`). This allows you to use `sks cat` in your prompt to trigger the model to generate images of your cat in different settings, lighting, poses, and styles.
-
-DreamBooth checkpoints are typically a few GBs in size because it contains the full model weights.
-
-Load the DreamBooth checkpoint with [`~DiffusionPipeline.from_pretrained`] and include the unique identifier in the prompt to activate its generation.
-
-```py
-import torch
-from diffusers import AutoPipelineForText2Image
-
-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "sd-dreambooth-library/herge-style",
-    torch_dtype=torch.float16
-).to("cuda")
-prompt = "A cute sks herge_style brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration"
-pipeline(prompt).images[0]
-```
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_dreambooth.png" />
-</div>
@@ -485,7 +485,7 @@ image = image[:, :, None]
 image = np.concatenate([image, image, image], axis=2)
 canny_image = Image.fromarray(image).resize((1024, 1216))

-adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, variant="fp16").to("cuda")
+adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, variant="fp16").to("cuda")

 unet = UNet2DConditionModel.from_pretrained(
    "latent-consistency/lcm-sdxl",
@@ -551,7 +551,7 @@ image = image[:, :, None]
 image = np.concatenate([image, image, image], axis=2)
 canny_image = Image.fromarray(image).resize((1024, 1024))

-adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, variant="fp16").to("cuda")
+adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, variant="fp16").to("cuda")

 pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
@@ -0,0 +1,416 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Load adapters
+
+[[open-in-colab]]
+
+There are several [training](../training/overview) techniques for personalizing diffusion models to generate images of a specific subject or images in certain styles. Each of these training methods produces a different type of adapter. Some of the adapters generate an entirely new model, while other adapters only modify a smaller set of embeddings or weights. This means the loading process for each adapter is also different.
+
+This guide will show you how to load DreamBooth, textual inversion, and LoRA weights.
+
+<Tip>
+
+Feel free to browse the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer), [LoRA the Explorer](https://huggingface.co/spaces/multimodalart/LoraTheExplorer), and the [Diffusers Models Gallery](https://huggingface.co/spaces/huggingface-projects/diffusers-gallery) for checkpoints and embeddings to use.
+
+</Tip>
+
+## DreamBooth
+
+[DreamBooth](https://dreambooth.github.io/) finetunes an *entire diffusion model* on just several images of a subject to generate images of that subject in new styles and settings. This method works by using a special word in the prompt that the model learns to associate with the subject image. Of all the training methods, DreamBooth produces the largest file size (usually a few GBs) because it is a full checkpoint model.
+
+Let's load the [herge_style](https://huggingface.co/sd-dreambooth-library/herge-style) checkpoint, which is trained on just 10 images drawn by Hergé, to generate images in that style. For it to work, you need to include the special word `herge_style` in your prompt to trigger the checkpoint:
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+
+pipeline = AutoPipelineForText2Image.from_pretrained("sd-dreambooth-library/herge-style", torch_dtype=torch.float16).to("cuda")
+prompt = "A cute herge_style brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration"
+image = pipeline(prompt).images[0]
+image
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_dreambooth.png" />
+</div>
+
+## Textual inversion
+
+[Textual inversion](https://textual-inversion.github.io/) is very similar to DreamBooth and it can also personalize a diffusion model to generate certain concepts (styles, objects) from just a few images. This method works by training and finding new embeddings that represent the images you provide with a special word in the prompt. As a result, the diffusion model weights stay the same and the training process produces a relatively tiny (a few KBs) file.
+
+Because textual inversion creates embeddings, it cannot be used on its own like DreamBooth and requires another model.
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+
+pipeline = AutoPipelineForText2Image.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
+```
+
+Now you can load the textual inversion embeddings with the [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] method and generate some images. Let's load the [sd-concepts-library/gta5-artwork](https://huggingface.co/sd-concepts-library/gta5-artwork) embeddings and you'll need to include the special word `<gta5-artwork>` in your prompt to trigger it:
+
+```py
+pipeline.load_textual_inversion("sd-concepts-library/gta5-artwork")
+prompt = "A cute brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration, <gta5-artwork> style"
+image = pipeline(prompt).images[0]
+image
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_txt_embed.png" />
+</div>
+
+Textual inversion can also be trained on undesirable things to create *negative embeddings* to discourage a model from generating images with those undesirable things like blurry images or extra fingers on a hand. This can be an easy way to quickly improve your prompt. You'll also load the embeddings with [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`], but this time, you'll need two more parameters:
+
+- `weight_name`: specifies the weight file to load if the file was saved in the 🤗 Diffusers format with a specific name or if the file is stored in the A1111 format
+- `token`: specifies the special word to use in the prompt to trigger the embeddings
+
+Let's load the [sayakpaul/EasyNegative-test](https://huggingface.co/sayakpaul/EasyNegative-test) embeddings:
+
+```py
+pipeline.load_textual_inversion(
+    "sayakpaul/EasyNegative-test", weight_name="EasyNegative.safetensors", token="EasyNegative"
+)
+```
+
+Now you can use the `token` to generate an image with the negative embeddings:
+
+```py
+prompt = "A cute brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration, EasyNegative"
+negative_prompt = "EasyNegative"
+
+image = pipeline(prompt, negative_prompt=negative_prompt, num_inference_steps=50).images[0]
+image
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png" />
+</div>
+
+## LoRA
+
+[Low-Rank Adaptation (LoRA)](https://huggingface.co/papers/2106.09685) is a popular training technique because it is fast and generates smaller file sizes (a couple hundred MBs). Like the other methods in this guide, LoRA can train a model to learn new styles from just a few images. It works by inserting new weights into the diffusion model and then only the new weights are trained instead of the entire model. This makes LoRAs faster to train and easier to store.
+
+<Tip>
+
+LoRA is a very general training technique that can be used with other training methods. For example, it is common to train a model with DreamBooth and LoRA. It is also increasingly common to load and merge multiple LoRAs to create new and unique images. You can learn more about it in the in-depth [Merge LoRAs](merge_loras) guide since merging is outside the scope of this loading guide.
+
+</Tip>
+
+LoRAs also need to be used with another model:
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+
+pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
+```
+
+Then use the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method to load the [ostris/super-cereal-sdxl-lora](https://huggingface.co/ostris/super-cereal-sdxl-lora) weights and specify the weights filename from the repository:
+
+```py
+pipeline.load_lora_weights("ostris/super-cereal-sdxl-lora", weight_name="cereal_box_sdxl_v1.safetensors")
+prompt = "bears, pizza bites"
+image = pipeline(prompt).images[0]
+image
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_lora.png" />
+</div>
+
+The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method loads LoRA weights into both the UNet and text encoder. It is the preferred way for loading LoRAs because it can handle cases where:
+
+- the LoRA weights don't have separate identifiers for the UNet and text encoder
+- the LoRA weights have separate identifiers for the UNet and text encoder
+
+To directly load (and save) a LoRA adapter at the *model-level*, use [`~loaders.PeftAdapterMixin.load_lora_adapter`], which builds and prepares the necessary model configuration for the adapter. Like [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`], [`~loaders.PeftAdapterMixin.load_lora_adapter`] can load LoRAs for both the UNet and text encoder. For example, if you're loading a LoRA for the UNet, [`~loaders.PeftAdapterMixin.load_lora_adapter`] ignores the keys for the text encoder.
+
+Use the `weight_name` parameter to specify the specific weight file and the `prefix` parameter to filter for the appropriate state dicts (`"unet"` in this case) to load.
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+
+pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
+pipeline.unet.load_lora_adapter("jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", prefix="unet")
+
+# use cnmt in the prompt to trigger the LoRA
+prompt = "A cute cnmt eating a slice of pizza, stunning color scheme, masterpiece, illustration"
+image = pipeline(prompt).images[0]
+image
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_attn_proc.png" />
+</div>
+
+Save an adapter with [`~loaders.PeftAdapterMixin.save_lora_adapter`].
+
+To unload the LoRA weights, use the [`~loaders.StableDiffusionLoraLoaderMixin.unload_lora_weights`] method to discard the LoRA weights and restore the model to its original weights:
+
+```py
+pipeline.unload_lora_weights()
+```
+
+### Adjust LoRA weight scale
+
+For both [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] and [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`], you can pass the `cross_attention_kwargs={"scale": 0.5}` parameter to adjust how much of the LoRA weights to use. A value of `0` is the same as only using the base model weights, and a value of `1` is equivalent to using the fully finetuned LoRA.
+
+For more granular control over the amount of LoRA weights used per layer, you can use [`~loaders.StableDiffusionLoraLoaderMixin.set_adapters`] and pass a dictionary specifying how much to scale the weights in each layer.
+```python
+pipe = ... # create pipeline
+pipe.load_lora_weights(..., adapter_name="my_adapter")
+scales = {
+    "text_encoder": 0.5,
+    "text_encoder_2": 0.5,  # only usable if pipe has a 2nd text encoder
+    "unet": {
+        "down": 0.9,  # all transformers in the down-part will use scale 0.9
+        # "mid"  # in this example "mid" is not given, therefore all transformers in the mid part will use the default scale 1.0
+        "up": {
+            "block_0": 0.6,  # all 3 transformers in the 0th block in the up-part will use scale 0.6
+            "block_1": [0.4, 0.8, 1.0],  # the 3 transformers in the 1st block in the up-part will use scales 0.4, 0.8 and 1.0 respectively
+        }
+    }
+}
+pipe.set_adapters("my_adapter", scales)
+```
+
+This also works with multiple adapters - see [this guide](https://huggingface.co/docs/diffusers/tutorials/using_peft_for_inference#customize-adapters-strength) for how to do it.
+
+<Tip warning={true}>
+
+Currently, [`~loaders.StableDiffusionLoraLoaderMixin.set_adapters`] only supports scaling attention weights. If a LoRA has other parts (e.g., resnets or down-/upsamplers), they will keep a scale of 1.0.
+
+</Tip>
+
+### Hotswapping LoRA adapters
+
+A common use case when serving multiple adapters is to load one adapter first, generate images, load another adapter, generate more images, load another adapter, etc. This workflow normally requires calling [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`], [`~loaders.StableDiffusionLoraLoaderMixin.set_adapters`], and possibly [`~loaders.peft.PeftAdapterMixin.delete_adapters`] to save memory. Moreover, if the model is compiled using `torch.compile`, performing these steps requires recompilation, which takes time.
+
+To better support this common workflow, you can "hotswap" a LoRA adapter, to avoid accumulating memory and in some cases, recompilation. It requires an adapter to already be loaded, and the new adapter weights are swapped in-place for the existing adapter.
+
+Pass `hotswap=True` when loading a LoRA adapter to enable this feature. It is important to indicate the name of the existing adapter to be swapped (`default_0` is the default adapter name). If you loaded the first adapter with a different name, use that name instead.
+
+```python
+pipeline = ...
+# load adapter 1 as normal
+pipeline.load_lora_weights(file_name_adapter_1)
+# generate some images with adapter 1
+...
+# now hot swap the 2nd adapter
+pipeline.load_lora_weights(file_name_adapter_2, hotswap=True, adapter_name="default_0")
+# generate images with adapter 2
+```
+
+
+<Tip warning={true}>
+
+Hotswapping is not currently supported for LoRA adapters that target the text encoder.
+
+</Tip>
+
+For compiled models, it is often (though not always if the second adapter targets identical LoRA ranks and scales) necessary to call [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`] to avoid recompilation. Use [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`] _before_ loading the first adapter, and `torch.compile` should be called _after_ loading the first adapter.
+
+```python
+pipe = ...
+# call this extra method
+pipe.enable_lora_hotswap(target_rank=max_rank)
+# now load adapter 1
+pipe.load_lora_weights(file_name_adapter_1)
+# now compile the unet of the pipeline
+pipe.unet = torch.compile(pipe.unet, ...)
+# generate some images with adapter 1
+...
+# now hot swap adapter 2
+pipe.load_lora_weights(file_name_adapter_2, hotswap=True, adapter_name="default_0")
+# generate images with adapter 2
+```
+
+The `target_rank=max_rank` argument is important for setting the maximum rank among all LoRA adapters that will be loaded. If you have one adapter with rank 8 and another with rank 16, pass `target_rank=16`. You should use a higher value if in doubt. By default, this value is 128.
+
+However, there can be situations where recompilation is unavoidable. For example, if the hotswapped adapter targets more layers than the initial adapter, then recompilation is triggered. Try to load the adapter that targets the most layers first. Refer to the PEFT docs on [hotswapping](https://huggingface.co/docs/peft/main/en/package_reference/hotswap#peft.utils.hotswap.hotswap_adapter) for more details about the limitations of this feature.
+
+<Tip>
+
+Move your code inside the `with torch._dynamo.config.patch(error_on_recompile=True)` context manager to detect if a model was recompiled. If you detect recompilation despite following all the steps above, please open an issue with [Diffusers](https://github.com/huggingface/diffusers/issues) with a reproducible example.
+
+</Tip>
+
+### Kohya and TheLastBen
+
+Other popular LoRA trainers from the community include those by [Kohya](https://github.com/kohya-ss/sd-scripts/) and [TheLastBen](https://github.com/TheLastBen/fast-stable-diffusion). These trainers create different LoRA checkpoints than those trained by 🤗 Diffusers, but they can still be loaded in the same way.
+
+<hfoptions id="other-trainers">
+<hfoption id="Kohya">
+
+To load a Kohya LoRA, let's download the [Blueprintify SD XL 1.0](https://civitai.com/models/150986/blueprintify-sd-xl-10) checkpoint from [Civitai](https://civitai.com/) as an example:
+
+```sh
+!wget https://civitai.com/api/download/models/168776 -O blueprintify-sd-xl-10.safetensors
+```
+
+Load the LoRA checkpoint with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method, and specify the filename in the `weight_name` parameter:
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+
+pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
+pipeline.load_lora_weights("path/to/weights", weight_name="blueprintify-sd-xl-10.safetensors")
+```
+
+Generate an image:
+
+```py
+# use bl3uprint in the prompt to trigger the LoRA
+prompt = "bl3uprint, a highly detailed blueprint of the eiffel tower, explaining how to build all parts, many txt, blueprint grid backdrop"
+image = pipeline(prompt).images[0]
+image
+```
+
+<Tip warning={true}>
+
+Some limitations of using Kohya LoRAs with 🤗 Diffusers include:
+
+- Images may not look like those generated by UIs - like ComfyUI - for multiple reasons, which are explained [here](https://github.com/huggingface/diffusers/pull/4287/#issuecomment-1655110736).
+- [LyCORIS checkpoints](https://github.com/KohakuBlueleaf/LyCORIS) aren't fully supported. The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method loads LyCORIS checkpoints with LoRA and LoCon modules, but Hada and LoKR are not supported.
+
+</Tip>
+
+</hfoption>
+<hfoption id="TheLastBen">
+
+Loading a checkpoint from TheLastBen is very similar. For example, to load the [TheLastBen/William_Eggleston_Style_SDXL](https://huggingface.co/TheLastBen/William_Eggleston_Style_SDXL) checkpoint:
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+
+pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
+pipeline.load_lora_weights("TheLastBen/William_Eggleston_Style_SDXL", weight_name="wegg.safetensors")
+
+# use by william eggleston in the prompt to trigger the LoRA
+prompt = "a house by william eggleston, sunrays, beautiful, sunlight, sunrays, beautiful"
+image = pipeline(prompt=prompt).images[0]
+image
+```
+
+</hfoption>
+</hfoptions>
+
+## IP-Adapter
+
+[IP-Adapter](https://ip-adapter.github.io/) is a lightweight adapter that enables image prompting for any diffusion model. This adapter works by decoupling the cross-attention layers of the image and text features. All the other model components are frozen and only the embedded image features in the UNet are trained. As a result, IP-Adapter files are typically only ~100MBs.
+
+You can learn more about how to use IP-Adapter for different tasks and specific use cases in the [IP-Adapter](../using-diffusers/ip_adapter) guide.
+
+> [!TIP]
+> Diffusers currently only supports IP-Adapter for some of the most popular pipelines. Feel free to open a feature request if you have a cool use case and want to integrate IP-Adapter with an unsupported pipeline!
+> Official IP-Adapter checkpoints are available from [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter).
+
+To start, load a Stable Diffusion checkpoint.
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+from diffusers.utils import load_image
+
+pipeline = AutoPipelineForText2Image.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
+```
+
+Then load the IP-Adapter weights and add it to the pipeline with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method.
+
+```py
+pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
+```
+
+Once loaded, you can use the pipeline with an image and text prompt to guide the image generation process.
+
+```py
+image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png")
+generator = torch.Generator(device="cpu").manual_seed(33)
+images = pipeline(
+    prompt='best quality, high quality, wearing sunglasses',
+    ip_adapter_image=image,
+    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
+    num_inference_steps=50,
+    generator=generator,
+).images[0]
+images
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip-bear.png" />
+</div>
+
+### IP-Adapter Plus
+
+IP-Adapter relies on an image encoder to generate image features. If the IP-Adapter repository contains an `image_encoder` subfolder, the image encoder is automatically loaded and registered to the pipeline. Otherwise, you'll need to explicitly load the image encoder with a [`~transformers.CLIPVisionModelWithProjection`] model and pass it to the pipeline.
+
+This is the case for *IP-Adapter Plus* checkpoints which use the ViT-H image encoder.
+
+```py
+from transformers import CLIPVisionModelWithProjection
+
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+    "h94/IP-Adapter",
+    subfolder="models/image_encoder",
+    torch_dtype=torch.float16
+)
+
+pipeline = AutoPipelineForText2Image.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    image_encoder=image_encoder,
+    torch_dtype=torch.float16
+).to("cuda")
+
+pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter-plus_sdxl_vit-h.safetensors")
+```
+
+### IP-Adapter Face ID models
+
+The IP-Adapter FaceID models are experimental IP Adapters that use image embeddings generated by `insightface` instead of CLIP image embeddings. Some of these models also use LoRA to improve ID consistency.
+You need to install `insightface` and all its requirements to use these models.
+
+<Tip warning={true}>
+As InsightFace pretrained models are available for non-commercial research purposes, IP-Adapter-FaceID models are released exclusively for research purposes and are not intended for commercial use.
+</Tip>
+
+```py
+pipeline = AutoPipelineForText2Image.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16
+).to("cuda")
+
+pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid_sdxl.bin", image_encoder_folder=None)
+```
+
+If you want to use one of the two IP-Adapter FaceID Plus models, you must also load the CLIP image encoder, as these models use both `insightface` and CLIP image embeddings to achieve better photorealism.
+
+```py
+from transformers import CLIPVisionModelWithProjection
+
+image_encoder = CLIPVisionModelWithProjection.from_pretrained(
+    "laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
+    torch_dtype=torch.float16,
+)
+
+pipeline = AutoPipelineForText2Image.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    image_encoder=image_encoder,
+    torch_dtype=torch.float16
+).to("cuda")
+
+pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid-plus_sd15.bin")
+```
@@ -0,0 +1,266 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Merge LoRAs
+
+It can be fun and creative to use multiple [LoRAs](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) together to generate something entirely new and unique. This works by merging multiple LoRA weights together to produce images that are a blend of different styles. Diffusers provides a few methods to merge LoRAs depending on *how* you want to merge their weights, which can affect image quality.
+
+This guide will show you how to merge LoRAs using the [`~loaders.PeftAdapterMixin.set_adapters`] and [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) methods. To improve inference speed and reduce memory-usage of merged LoRAs, you'll also see how to use the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method to fuse the LoRA weights with the original weights of the underlying model.
+
+For this guide, load a Stable Diffusion XL (SDXL) checkpoint and the [ostris/ikea-instructions-lora-sdxl](https://huggingface.co/ostris/ikea-instructions-lora-sdxl) and [lordjia/by-feng-zikai](https://huggingface.co/lordjia/by-feng-zikai) LoRAs with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method. You'll need to assign each LoRA an `adapter_name` to combine them later.
+
+```py
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
+pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl", weight_name="ikea_instructions_xl_v1_5.safetensors", adapter_name="ikea")
+pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_XL.safetensors", adapter_name="feng")
+```
+
+## set_adapters
+
+The [`~loaders.PeftAdapterMixin.set_adapters`] method merges LoRA adapters by concatenating their weighted matrices. Use the adapter name to specify which LoRAs to merge, and the `adapter_weights` parameter to control the scaling for each LoRA. For example, if `adapter_weights=[0.5, 0.5]`, then the merged LoRA output is an average of both LoRAs. Try adjusting the adapter weights to see how it affects the generated image!
+
+```py
+pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
+
+generator = torch.manual_seed(0)
+prompt = "A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai"
+image = pipeline(prompt, generator=generator, cross_attention_kwargs={"scale": 1.0}).images[0]
+image
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lora_merge_set_adapters.png"/>
+</div>
+
+## add_weighted_adapter
+
+> [!WARNING]
+> This is an experimental method that adds PEFT's [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method to Diffusers to enable more efficient merging methods. Check out this [issue](https://github.com/huggingface/diffusers/issues/6892) if you're interested in learning more about the motivation and design behind this integration.
+
+The [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method provides access to more efficient merging methods such as [TIES and DARE](https://huggingface.co/docs/peft/developer_guides/model_merging). To use these merging methods, make sure you have the latest stable version of Diffusers and PEFT installed.
+
+```bash
+pip install -U diffusers peft
+```
+
+There are three steps to merge LoRAs with the [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method:
+
+1. Create a [PeftModel](https://huggingface.co/docs/peft/package_reference/peft_model#peft.PeftModel) from the underlying model and LoRA checkpoint.
+2. Load a base UNet model and the LoRA adapters.
+3. Merge the adapters using the [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method and the merging method of your choice.
+
+Let's dive deeper into what these steps entail.
+
+1. Load a UNet that corresponds to the UNet in the LoRA checkpoint. In this case, both LoRAs use the SDXL UNet as their base model.
+
+```python
+from diffusers import AutoModel
+import torch
+
+unet = AutoModel.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+    variant="fp16",
+    subfolder="unet",
+).to("cuda")
+```
+
+Load the SDXL pipeline and the LoRA checkpoints, starting with the [ostris/ikea-instructions-lora-sdxl](https://huggingface.co/ostris/ikea-instructions-lora-sdxl) LoRA.
+
+```python
+from diffusers import DiffusionPipeline
+
+pipeline = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    variant="fp16",
+    torch_dtype=torch.float16,
+    unet=unet
+).to("cuda")
+pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl", weight_name="ikea_instructions_xl_v1_5.safetensors", adapter_name="ikea")
+```
+
+Now you'll create a [PeftModel](https://huggingface.co/docs/peft/package_reference/peft_model#peft.PeftModel) from the loaded LoRA checkpoint by combining the SDXL UNet and the LoRA UNet from the pipeline.
+
+```python
+from peft import get_peft_model, LoraConfig
+import copy
+
+sdxl_unet = copy.deepcopy(unet)
+ikea_peft_model = get_peft_model(
+    sdxl_unet,
+    pipeline.unet.peft_config["ikea"],
+    adapter_name="ikea"
+)
+
+original_state_dict = {f"base_model.model.{k}": v for k, v in pipeline.unet.state_dict().items()}
+ikea_peft_model.load_state_dict(original_state_dict, strict=True)
+```
+
+> [!TIP]
+> You can optionally push the ikea_peft_model to the Hub by calling `ikea_peft_model.push_to_hub("ikea_peft_model", token=TOKEN)`.
+
+Repeat this process to create a [PeftModel](https://huggingface.co/docs/peft/package_reference/peft_model#peft.PeftModel) from the [lordjia/by-feng-zikai](https://huggingface.co/lordjia/by-feng-zikai) LoRA.
+
+```python
+pipeline.delete_adapters("ikea")
+sdxl_unet.delete_adapters("ikea")
+
+pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_XL.safetensors", adapter_name="feng")
+pipeline.set_adapters(adapter_names="feng")
+
+feng_peft_model = get_peft_model(
+    sdxl_unet,
+    pipeline.unet.peft_config["feng"],
+    adapter_name="feng"
+)
+
+original_state_dict = {f"base_model.model.{k}": v for k, v in pipeline.unet.state_dict().items()}
+feng_peft_model.load_state_dict(original_state_dict, strict=True)
+```
+
+2. Load a base UNet model and then load the adapters onto it.
+
+```python
+from peft import PeftModel
+
+base_unet = AutoModel.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    torch_dtype=torch.float16,
+    use_safetensors=True,
+    variant="fp16",
+    subfolder="unet",
+).to("cuda")
+
+model = PeftModel.from_pretrained(base_unet, "stevhliu/ikea_peft_model", use_safetensors=True, subfolder="ikea", adapter_name="ikea")
+model.load_adapter("stevhliu/feng_peft_model", use_safetensors=True, subfolder="feng", adapter_name="feng")
+```
+
+3. Merge the adapters using the [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method and the merging method of your choice (learn more about other merging methods in this [blog post](https://huggingface.co/blog/peft_merging)). For this example, let's use the `"dare_linear"` method to merge the LoRAs.
+
+> [!WARNING]
+> Keep in mind the LoRAs need to have the same rank to be merged!
+
+```python
+model.add_weighted_adapter(
+    adapters=["ikea", "feng"],
+    weights=[1.0, 1.0],
+    combination_type="dare_linear",
+    adapter_name="ikea-feng"
+)
+model.set_adapters("ikea-feng")
+```
+
+Now you can generate an image with the merged LoRA.
+
+```python
+model = model.to(dtype=torch.float16, device="cuda")
+
+pipeline = DiffusionPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", unet=model, variant="fp16", torch_dtype=torch.float16,
+).to("cuda")
+
+image = pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai", generator=torch.manual_seed(0)).images[0]
+image
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ikea-feng-dare-linear.png"/>
+</div>
+
+## fuse_lora
+
+Both the [`~loaders.PeftAdapterMixin.set_adapters`] and [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) methods require loading the base model and the LoRA adapters separately, which incurs some overhead. The [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method allows you to fuse the LoRA weights directly with the original weights of the underlying model. This way, you're only loading the model once, which can increase inference speed and lower memory usage.
+
+You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage.
+
+For example, if you have a base model and adapters loaded and set as active with the following adapter weights:
+
+```py
+from diffusers import DiffusionPipeline
+import torch
+
+pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
+pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl", weight_name="ikea_instructions_xl_v1_5.safetensors", adapter_name="ikea")
+pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_XL.safetensors", adapter_name="feng")
+
+pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
+```
+
+Fuse these LoRAs into the UNet with the [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method. The `lora_scale` parameter controls how much the LoRA weights scale the output. It is important to make the `lora_scale` adjustments in the [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method because it won’t work if you try to pass `scale` to the `cross_attention_kwargs` in the pipeline.
+
+```py
+pipeline.fuse_lora(adapter_names=["ikea", "feng"], lora_scale=1.0)
+```
+
+Then you should use [`~loaders.StableDiffusionLoraLoaderMixin.unload_lora_weights`] to unload the LoRA weights since they've already been fused with the underlying base model. Finally, call [`~DiffusionPipeline.save_pretrained`] to save the fused pipeline locally or you could call [`~DiffusionPipeline.push_to_hub`] to push the fused pipeline to the Hub.
+
+```py
+pipeline.unload_lora_weights()
+# save locally
+pipeline.save_pretrained("path/to/fused-pipeline")
+# save to the Hub
+pipeline.push_to_hub("fused-ikea-feng")
+```
+
+Now you can quickly load the fused pipeline and use it for inference without needing to separately load the LoRA adapters.
+
+```py
+pipeline = DiffusionPipeline.from_pretrained(
+    "username/fused-ikea-feng", torch_dtype=torch.float16,
+).to("cuda")
+
+image = pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai", generator=torch.manual_seed(0)).images[0]
+image
+```
+
+You can call [`~loaders.lora_base.LoraBaseMixin.unfuse_lora`] to restore the original model's weights (for example, if you want to use a different `lora_scale` value). However, this only works if you've only fused one LoRA adapter to the original model. If you've fused multiple LoRAs, you'll need to reload the model.
+
+```py
+pipeline.unfuse_lora()
+```
+
+### torch.compile
+
+[torch.compile](../optimization/torch2.0#torchcompile) can speed up your pipeline even more, but the LoRA weights must be fused first and then unloaded. Typically, the UNet is compiled because it is such a computationally intensive component of the pipeline.
+
+```py
+from diffusers import DiffusionPipeline
+import torch
+
+# load base model and LoRAs
+pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
+pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl", weight_name="ikea_instructions_xl_v1_5.safetensors", adapter_name="ikea")
+pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_XL.safetensors", adapter_name="feng")
+
+# activate both LoRAs and set adapter weights
+pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
+
+# fuse LoRAs and unload weights
+pipeline.fuse_lora(adapter_names=["ikea", "feng"], lora_scale=1.0)
+pipeline.unload_lora_weights()
+
+# torch.compile
+pipeline.unet.to(memory_format=torch.channels_last)
+pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
+
+image = pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai", generator=torch.manual_seed(0)).images[0]
+```
+
+Learn more about torch.compile in the [Accelerate inference of text-to-image diffusion models](../tutorials/fast_diffusion#torchcompile) guide.
+
+## Next steps
+
+For more conceptual details about how each merging method works, take a look at the [🤗 PEFT welcomes new merging methods](https://huggingface.co/blog/peft_merging#concatenation-cat) blog post!
@@ -154,11 +154,11 @@ pipeline = AutoPipelineForInpainting.from_pretrained(
 pipeline.enable_model_cpu_offload()
 ```

-You can enable PAG on an existing inpainting pipeline like this
+You can enable PAG on an existing inpainting pipeline like this

 ```py
-pipeline_inpaint = AutoPipelineForInpainting.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
-pipeline = AutoPipelineForInpainting.from_pipe(pipeline_inpaint, enable_pag=True)
+pipeline_inpaint = AutoPipelineForInpainting.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
+pipeline = AutoPipelineForInpainting.from_pipe(pipeline_inpaint, enable_pag=True)
 ```

 This still works when your pipeline has a different task:
@@ -12,21 +12,41 @@ specific language governing permissions and limitations under the License.

 # T2I-Adapter

-[T2I-Adapter](https://huggingface.co/papers/2302.08453) is an adapter that enables controllable generation like [ControlNet](./controlnet). A T2I-Adapter works by learning a *mapping* between a control signal (for example, a depth map) and a pretrained model's internal knowledge. The adapter is plugged in to the base model to provide extra guidance based on the control signal during generation.
+[T2I-Adapter](https://hf.co/papers/2302.08453) is a lightweight adapter for controlling and providing more accurate
+structure guidance for text-to-image models. It works by learning an alignment between the internal knowledge of the
+text-to-image model and an external control signal, such as edge detection or depth estimation.

-Load a T2I-Adapter conditioned on a specific control, such as canny edge, and pass it to the pipeline in [`~DiffusionPipeline.from_pretrained`].
+The T2I-Adapter design is simple: the condition is passed to four feature extraction blocks and three downsample
+blocks. This makes it fast and easy to train different adapters for different conditions which can be plugged into the
+text-to-image model. T2I-Adapter is similar to [ControlNet](controlnet) except it is smaller (~77M parameters) and
+faster because it only runs once during the diffusion process. The downside is that performance may be slightly worse
+than ControlNet.
+
+This guide will show you how to use T2I-Adapter with different Stable Diffusion models and how you can compose multiple
+T2I-Adapters to impose more than one condition.
+
+> [!TIP]
+> There are several T2I-Adapters available for different conditions, such as color palette, depth, sketch, pose, and
+> segmentation. Check out the [TencentARC](https://hf.co/TencentARC) repository to try them out!
+
+Before you begin, make sure you have the following libraries installed.

 ```py
-import torch
-from diffusers import T2IAdapter, StableDiffusionXLAdapterPipeline, AutoencoderKL
-
-t2i_adapter = T2IAdapter.from_pretrained(
-    "TencentARC/t2i-adapter-canny-sdxl-1.0",
-    torch_dtype=torch.float16,
-)
+# uncomment to install the necessary libraries in Colab
+#!pip install -q diffusers accelerate controlnet-aux==0.0.7
 ```

-Generate a canny image with [opencv-python](https://github.com/opencv/opencv-python).
+## Text-to-image
+
+Text-to-image models rely on a prompt to generate an image, but sometimes, text alone may not be enough to provide more
+accurate structural guidance. T2I-Adapter allows you to provide an additional control image to guide the generation
+process. For example, you can provide a canny image (a white outline of an image on a black background) to guide the
+model to generate an image with a similar structure.
+
+<hfoptions id="stablediffusion">
+<hfoption id="Stable Diffusion 1.5">
+
+Create a canny image with the [opencv-python](https://github.com/opencv/opencv-python) library.

 ```py
 import cv2
@@ -34,124 +54,166 @@ import numpy as np
 from PIL import Image
 from diffusers.utils import load_image

-original_image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"
-)
-
-image = np.array(original_image)
+image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png")
+image = np.array(image)

 low_threshold = 100
 high_threshold = 200

 image = cv2.Canny(image, low_threshold, high_threshold)
-image = image[:, :, None]
-image = np.concatenate([image, image, image], axis=2)
-canny_image = Image.fromarray(image)
+image = Image.fromarray(image)
 ```

-Pass the canny image to the pipeline to generate an image.
-
-```py
-vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
-pipeline = StableDiffusionXLAdapterPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    adapter=t2i_adapter,
-    vae=vae,
-    torch_dtype=torch.float16,
-).to("cuda")
-
-prompt = """
-A photorealistic overhead image of a cat reclining sideways in a flamingo pool floatie holding a margarita. 
-The cat is floating leisurely in the pool and completely relaxed and happy.
-"""
-
-pipeline(
-    prompt, 
-    image=canny_image,
-    num_inference_steps=100, 
-    guidance_scale=10,
-).images[0]
-```
-
-<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png" width="300" alt="Generated image (prompt only)"/>
-    <figcaption style="text-align: center;">original image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png" width="300" alt="Control image (Canny edges)"/>
-    <figcaption style="text-align: center;">canny image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-canny-cat-generated.png" width="300" alt="Generated image (ControlNet + prompt)"/>
-    <figcaption style="text-align: center;">generated image</figcaption>
-  </figure>
-</div>
-
-## MultiAdapter
-
-You can compose multiple controls, such as canny image and a depth map, with the [`MultiAdapter`] class.
-
-The example below composes a canny image and depth map.
-
-Load the control images and T2I-Adapters as a list.
+Now load a T2I-Adapter conditioned on [canny images](https://hf.co/TencentARC/t2iadapter_canny_sd15v2) and pass it to
+the [`StableDiffusionAdapterPipeline`].

 ```py
 import torch
-from diffusers.utils import load_image
-from diffusers import StableDiffusionXLAdapterPipeline, AutoencoderKL, MultiAdapter, T2IAdapter
+from diffusers import StableDiffusionAdapterPipeline, T2IAdapter

-canny_image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png"
+adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_canny_sd15v2", torch_dtype=torch.float16)
+pipeline = StableDiffusionAdapterPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    adapter=adapter,
+    torch_dtype=torch.float16,
+)
+pipeline.to("cuda")
+```
+
+Finally, pass your prompt and control image to the pipeline.
+
+```py
+generator = torch.Generator("cuda").manual_seed(0)
+
+image = pipeline(
+    prompt="cinematic photo of a plush and soft midcentury style rug on a wooden floor, 35mm photograph, film, professional, 4k, highly detailed",
+    image=image,
+    generator=generator,
+).images[0]
+image
+```
+
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-sd1.5.png"/>
+</div>
+
+</hfoption>
+<hfoption id="Stable Diffusion XL">
+
+Create a canny image with the [controlnet-aux](https://github.com/huggingface/controlnet_aux) library.
+
+```py
+from controlnet_aux.canny import CannyDetector
+from diffusers.utils import load_image
+
+canny_detector = CannyDetector()
+
+image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png")
+image = canny_detector(image, detect_resolution=384, image_resolution=1024)
+```
+
+Now load a T2I-Adapter conditioned on [canny images](https://hf.co/TencentARC/t2i-adapter-canny-sdxl-1.0) and pass it
+to the [`StableDiffusionXLAdapterPipeline`].
+
+```py
+import torch
+from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, EulerAncestralDiscreteScheduler, AutoencoderKL
+
+scheduler = EulerAncestralDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler")
+vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
+adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16)
+pipeline = StableDiffusionXLAdapterPipeline.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0",
+    adapter=adapter,
+    vae=vae,
+    scheduler=scheduler,
+    torch_dtype=torch.float16,
+    variant="fp16",
+)
+pipeline.to("cuda")
+```
+
+Finally, pass your prompt and control image to the pipeline.
+
+```py
+generator = torch.Generator("cuda").manual_seed(0)
+
+image = pipeline(
+  prompt="cinematic photo of a plush and soft midcentury style rug on a wooden floor, 35mm photograph, film, professional, 4k, highly detailed",
+  image=image,
+  generator=generator,
+).images[0]
+image
+```
+
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-sdxl.png"/>
+</div>
+
+</hfoption>
+</hfoptions>
+
+## MultiAdapter
+
+T2I-Adapters are also composable, allowing you to use more than one adapter to impose multiple control conditions on an
+image. For example, you can use a pose map to provide structural control and a depth map for depth control. This is
+enabled by the [`MultiAdapter`] class.
+
+Let's condition a text-to-image model with a pose and depth adapter. Create your pose and depth images and place them in a list.
+
+```py
+from diffusers.utils import load_image
+
+pose_image = load_image(
+    "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"
 )
 depth_image = load_image(
-    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_depth_image.png"
+    "https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"
 )
-controls = [canny_image, depth_image]
-prompt = ["""
-a relaxed rabbit sitting on a striped towel next to a pool with a tropical drink nearby, 
-bright sunny day, vacation scene, 35mm photograph, film, professional, 4k, highly detailed
-"""]
+cond = [pose_image, depth_image]
+prompt = ["Santa Claus walking into an office room with a beautiful city view"]
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">depth image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">pose image</figcaption>
+  </div>
+</div>
+
+Load the corresponding pose and depth adapters as a list in the [`MultiAdapter`] class.
+
+```py
+import torch
+from diffusers import StableDiffusionAdapterPipeline, MultiAdapter, T2IAdapter

 adapters = MultiAdapter(
    [
-        T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16),
-        T2IAdapter.from_pretrained("TencentARC/t2i-adapter-depth-midas-sdxl-1.0", torch_dtype=torch.float16),
+        T2IAdapter.from_pretrained("TencentARC/t2iadapter_keypose_sd14v1"),
+        T2IAdapter.from_pretrained("TencentARC/t2iadapter_depth_sd14v1"),
    ]
 )
+adapters = adapters.to(torch.float16)
 ```

-Pass the adapters, prompt, and control images to [`StableDiffusionXLAdapterPipeline`]. Use the `adapter_conditioning_scale` parameter to determine how much weight to assign to each control.
+Finally, load a [`StableDiffusionAdapterPipeline`] with the adapters, and pass your prompt and conditioned images to
+it. Use the `adapter_conditioning_scale` parameter to adjust the weight of each adapter on the image.

 ```py
-vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
-pipeline = StableDiffusionXLAdapterPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
+pipeline = StableDiffusionAdapterPipeline.from_pretrained(
+    "CompVis/stable-diffusion-v1-4",
    torch_dtype=torch.float16,
-    vae=vae,
    adapter=adapters,
 ).to("cuda")

-pipeline(
-    prompt,
-    image=controls,
-    height=1024,
-    width=1024,
-    adapter_conditioning_scale=[0.7, 0.7]
-).images[0]
+image = pipeline(prompt, cond, adapter_conditioning_scale=[0.7, 0.7]).images[0]
+image
 ```

-<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png" width="300" alt="Generated image (prompt only)"/>
-    <figcaption style="text-align: center;">canny image</figcaption>
-  </figure>
-  <figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_depth_image.png" width="300" alt="Control image (Canny edges)"/>
-    <figcaption style="text-align: center;">depth map</figcaption>
-  </figure>
-  <figure> 
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-multi-rabbit.png" width="300" alt="Generated image (ControlNet + prompt)"/>
-    <figcaption style="text-align: center;">generated image</figcaption>
-  </figure>
+<div class="flex justify-center">
+  <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-multi.png"/>
 </div>
@@ -10,56 +10,109 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Textual Inversion
+# Textual inversion

-[Textual Inversion](https://huggingface.co/papers/2208.01618) is a method for generating personalized images of a concept. It works by fine-tuning a models word embeddings on 3-5 images of the concept (for example, pixel art) that is associated with a unique token (`<sks>`). This allows you to use the `<sks>` token in your prompt to trigger the model to generate pixel art images.
+[[open-in-colab]]

-Textual Inversion weights are very lightweight and typically only a few KBs because they're only word embeddings. However, this also means the word embeddings need to be loaded after loading a model with [`~DiffusionPipeline.from_pretrained`].
+The [`StableDiffusionPipeline`] supports textual inversion, a technique that enables a model like Stable Diffusion to learn a new concept from just a few sample images. This gives you more control over the generated images and allows you to tailor the model towards specific concepts. You can get started quickly with a collection of community created concepts in the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer).
+
+This guide will show you how to run inference with textual inversion using a pre-learned concept from the Stable Diffusion Conceptualizer. If you're interested in teaching a model new concepts with textual inversion, take a look at the [Textual Inversion](../training/text_inversion) training guide.
+
+Import the necessary libraries:

 ```py
 import torch
-from diffusers import AutoPipelineForText2Image
-
-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
-    torch_dtype=torch.float16
-).to("cuda")
+from diffusers import StableDiffusionPipeline
+from diffusers.utils import make_image_grid
 ```

-Load the word embeddings with [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] and include the unique token in the prompt to activate its generation.
+## Stable Diffusion 1 and 2
+
+Pick a Stable Diffusion checkpoint and a pre-learned concept from the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer):

 ```py
-pipeline.load_textual_inversion("sd-concepts-library/gta5-artwork")
-prompt = "A cute brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration, <gta5-artwork> style"
-pipeline(prompt).images[0]
+pretrained_model_name_or_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
+repo_id_embeds = "sd-concepts-library/cat-toy"
+```
+
+Now you can load a pipeline, and pass the pre-learned concept to it:
+
+```py
+pipeline = StableDiffusionPipeline.from_pretrained(
+    pretrained_model_name_or_path, torch_dtype=torch.float16, use_safetensors=True
+).to("cuda")
+
+pipeline.load_textual_inversion(repo_id_embeds)
+```
+
+Create a prompt with the pre-learned concept by using the special placeholder token `<cat-toy>`, and choose the number of samples and rows of images you'd like to generate:
+
+```py
+prompt = "a grafitti in a favela wall with a <cat-toy> on it"
+
+num_samples_per_row = 2
+num_rows = 2
+```
+
+Then run the pipeline (feel free to adjust parameters like `num_inference_steps` and `guidance_scale` to see how they affect image quality), collect the generated images, and visualize them with the `make_image_grid` helper you imported at the beginning:
+
+```py
+all_images = []
+for _ in range(num_rows):
+    images = pipeline(prompt, num_images_per_prompt=num_samples_per_row, num_inference_steps=50, guidance_scale=7.5).images
+    all_images.extend(images)
+
+grid = make_image_grid(all_images, num_rows, num_samples_per_row)
+grid
 ```

 <div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_txt_embed.png" />
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/textual_inversion_inference.png">
 </div>

-Textual Inversion can also be trained to learn *negative embeddings* to steer generation away from unwanted characteristics such as "blurry" or "ugly". It is useful for improving image quality.
+## Stable Diffusion XL

-EasyNegative is a widely used negative embedding that contains multiple learned negative concepts. Load the negative embeddings and specify the file name and token associated with the negative embeddings. Pass the token to `negative_prompt` in your pipeline to activate it.
+Stable Diffusion XL (SDXL) can also use textual inversion vectors for inference. In contrast to Stable Diffusion 1 and 2, SDXL has two text encoders so you'll need two textual inversion embeddings - one for each text encoder model.
+
+Let's download the SDXL textual inversion embeddings and have a closer look at their structure:

 ```py
-import torch
-from diffusers import AutoPipelineForText2Image
+from huggingface_hub import hf_hub_download
+from safetensors.torch import load_file

-pipeline = AutoPipelineForText2Image.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
-    torch_dtype=torch.float16
-).to("cuda")
-pipeline.load_textual_inversion(
-    "EvilEngine/easynegative",
-    weight_name="easynegative.safetensors",
-    token="easynegative"
-)
-prompt = "A cute brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration"
-negative_prompt = "easynegative"
-pipeline(prompt, negative_prompt).images[0]
+file = hf_hub_download("dn118/unaestheticXL", filename="unaestheticXLv31.safetensors")
+state_dict = load_file(file)
+state_dict
 ```

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png" />
-</div>
+```
+{'clip_g': tensor([[ 0.0077, -0.0112,  0.0065,  ...,  0.0195,  0.0159,  0.0275],
+         ...,
+         [-0.0170,  0.0213,  0.0143,  ..., -0.0302, -0.0240, -0.0362]],
+ 'clip_l': tensor([[ 0.0023,  0.0192,  0.0213,  ..., -0.0385,  0.0048, -0.0011],
+         ...,
+         [ 0.0475, -0.0508, -0.0145,  ...,  0.0070, -0.0089, -0.0163]],
+```
+
+There are two tensors, `"clip_g"` and `"clip_l"`.
+`"clip_g"` corresponds to the bigger text encoder in SDXL and refers to
+`pipe.text_encoder_2` and `"clip_l"` refers to `pipe.text_encoder`.
+
+Now you can load each tensor separately by passing them along with the correct text encoder and tokenizer
+to [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`]:
+
+```py
+from diffusers import AutoPipelineForText2Image
+import torch
+
+pipe = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", variant="fp16", torch_dtype=torch.float16)
+pipe.to("cuda")
+
+pipe.load_textual_inversion(state_dict["clip_g"], token="unaestheticXLv31", text_encoder=pipe.text_encoder_2, tokenizer=pipe.tokenizer_2)
+pipe.load_textual_inversion(state_dict["clip_l"], token="unaestheticXLv31", text_encoder=pipe.text_encoder, tokenizer=pipe.tokenizer)
+
+# the embedding should be used as a negative embedding, so we pass it as a negative prompt
+generator = torch.Generator().manual_seed(33)
+image = pipe("a woman standing in front of a mountain", negative_prompt="unaestheticXLv31", generator=generator).images[0]
+image
+```
@@ -125,7 +125,7 @@ Now we'll simply specify the name of the dataset and caption column (in this cas
 ```

 You can also load a dataset straight from by specifying it's name in `dataset_name`.
-Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loading your own caption dataset.
+Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loading your own caption dataset.

 - **optimizer**: for this example, we'll use [prodigy](https://huggingface.co/blog/sdxl_lora_advanced_script#adaptive-optimizers) - an adaptive optimizer
 - **pivotal tuning**
@@ -404,7 +404,7 @@ The advanced script now supports custom choice of U-net blocks to train during D
 > In light of this, we're introducing a new feature to the advanced script to allow for configurable U-net learned blocks.

 **Usage**
-Configure LoRA learned U-net blocks adding a `lora_unet_blocks` flag, with a comma separated string specifying the targeted blocks.
+Configure LoRA learned U-net blocks adding a `lora_unet_blocks` flag, with a comma separated string specifying the targeted blocks.
 e.g:
 ```bash
 --lora_unet_blocks="unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1"
@@ -141,7 +141,7 @@ Now we'll simply specify the name of the dataset and caption column (in this cas
 ```

 You can also load a dataset straight from by specifying it's name in `dataset_name`.
-Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loading your own caption dataset.
+Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loading your own caption dataset.

 - **optimizer**: for this example, we'll use [prodigy](https://huggingface.co/blog/sdxl_lora_advanced_script#adaptive-optimizers) - an adaptive optimizer
 - **pivotal tuning**
@@ -430,9 +430,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -773,15 +770,6 @@ def parse_args(input_args=None):
        ),
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument(
-        "--image_interpolation_mode",
-        type=str,
-        default="lanczos",
-        choices=[
-            f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
-        ],
-        help="The image interpolation method to use for resizing images.",
-    )

    if input_args is not None:
        args = parser.parse_args(input_args)
@@ -1046,10 +1034,7 @@ class DreamBoothDataset(Dataset):
            self.instance_images.extend(itertools.repeat(img, repeats))

        self.pixel_values = []
-        interpolation = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper(), None)
-        if interpolation is None:
-            raise ValueError(f"Unsupported interpolation mode {interpolation=}.")
-        train_resize = transforms.Resize(size, interpolation=interpolation)
+        train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
        train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
        train_flip = transforms.RandomHorizontalFlip(p=1.0)
        train_transforms = transforms.Compose(
@@ -1093,7 +1078,7 @@ class DreamBoothDataset(Dataset):

        self.image_transforms = transforms.Compose(
            [
-                transforms.Resize(size, interpolation=interpolation),
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
@@ -1557,7 +1542,6 @@ def main(args):
    transformer_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=target_modules,
    )
@@ -1566,7 +1550,6 @@ def main(args):
        text_lora_config = LoraConfig(
            r=args.rank,
            lora_alpha=args.rank,
-            lora_dropout=args.lora_dropout,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -658,8 +658,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--use_dora",
        action="store_true",
@@ -675,15 +673,6 @@ def parse_args(input_args=None):
        default=False,
        help="Cache the VAE latents",
    )
-    parser.add_argument(
-        "--image_interpolation_mode",
-        type=str,
-        default="lanczos",
-        choices=[
-            f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
-        ],
-        help="The image interpolation method to use for resizing images.",
-    )

    if input_args is not None:
        args = parser.parse_args(input_args)
@@ -918,10 +907,6 @@ class DreamBoothDataset(Dataset):
        self.num_instance_images = len(self.instance_images)
        self._length = self.num_instance_images

-        interpolation = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper(), None)
-        if interpolation is None:
-            raise ValueError(f"Unsupported interpolation mode {interpolation=}.")
-
        if class_data_root is not None:
            self.class_data_root = Path(class_data_root)
            self.class_data_root.mkdir(parents=True, exist_ok=True)
@@ -936,7 +921,7 @@ class DreamBoothDataset(Dataset):

        self.image_transforms = transforms.Compose(
            [
-                transforms.Resize(size, interpolation=interpolation),
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
@@ -1250,7 +1235,6 @@ def main(args):
    unet_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        use_dora=args.use_dora,
        init_lora_weights="gaussian",
        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
@@ -1263,7 +1247,6 @@ def main(args):
        text_lora_config = LoraConfig(
            r=args.rank,
            lora_alpha=args.rank,
-            lora_dropout=args.lora_dropout,
            use_dora=args.use_dora,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
@@ -767,9 +767,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--use_dora",
        action="store_true",
@@ -802,15 +799,6 @@ def parse_args(input_args=None):
        default=False,
        help="Cache the VAE latents",
    )
-    parser.add_argument(
-        "--image_interpolation_mode",
-        type=str,
-        default="lanczos",
-        choices=[
-            f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
-        ],
-        help="The image interpolation method to use for resizing images.",
-    )

    if input_args is not None:
        args = parser.parse_args(input_args)
@@ -1081,10 +1069,7 @@ class DreamBoothDataset(Dataset):
        self.original_sizes = []
        self.crop_top_lefts = []
        self.pixel_values = []
-        interpolation = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper(), None)
-        if interpolation is None:
-            raise ValueError(f"Unsupported interpolation mode {interpolation=}.")
-        train_resize = transforms.Resize(size, interpolation=interpolation)
+        train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
        train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
        train_flip = transforms.RandomHorizontalFlip(p=1.0)
        train_transforms = transforms.Compose(
@@ -1161,7 +1146,7 @@ class DreamBoothDataset(Dataset):

        self.image_transforms = transforms.Compose(
            [
-                transforms.Resize(size, interpolation=interpolation),
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
@@ -1561,7 +1546,6 @@ def main(args):
        r=args.rank,
        use_dora=args.use_dora,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=target_modules,
    )
@@ -1574,7 +1558,6 @@ def main(args):
            r=args.rank,
            use_dora=args.use_dora,
            lora_alpha=args.rank,
-            lora_dropout=args.lora_dropout,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -1,6 +1,6 @@
 ## Amused training

-Amused can be finetuned on simple datasets relatively cheaply and quickly. Using 8bit optimizers, lora, and gradient accumulation, amused can be finetuned with as little as 5.5 GB. Here are a set of examples for finetuning amused on some relatively simple datasets. These training recipes are aggressively oriented towards minimal resources and fast verification -- i.e. the batch sizes are quite low and the learning rates are quite high. For optimal quality, you will probably want to increase the batch sizes and decrease learning rates.
+Amused can be finetuned on simple datasets relatively cheaply and quickly. Using 8bit optimizers, lora, and gradient accumulation, amused can be finetuned with as little as 5.5 GB. Here are a set of examples for finetuning amused on some relatively simple datasets. These training recipes are aggressively oriented towards minimal resources and fast verification -- i.e. the batch sizes are quite low and the learning rates are quite high. For optimal quality, you will probably want to increase the batch sizes and decrease learning rates.

 All training examples use fp16 mixed precision and gradient checkpointing. We don't show 8 bit adam + lora as its about the same memory use as just using lora (bitsandbytes uses full precision optimizer states for weights below a minimum size).

@@ -201,7 +201,7 @@ Note that setting the `<ID_TOKEN>` is not necessary. From some limited experimen
 > - The original repository uses a `lora_alpha` of `1`. We found this not suitable in many runs, possibly due to difference in modeling backends and training settings. Our recommendation is to set to the `lora_alpha` to either `rank` or `rank // 2`.
 > - If you're training on data whose captions generate bad results with the original model, a `rank` of 64 and above is good and also the recommendation by the team behind CogVideoX. If the generations are already moderately good on your training captions, a `rank` of 16/32 should work. We found that setting the rank too low, say `4`, is not ideal and doesn't produce promising results.
 > - The authors of CogVideoX recommend 4000 training steps and 100 training videos overall to achieve the best result. While that might yield the best results, we found from our limited experimentation that 2000 steps and 25 videos could also be sufficient.
-> - When using the Prodigy optimizer for training, one can follow the recommendations from [this](https://huggingface.co/blog/sdxl_lora_advanced_script) blog. Prodigy tends to overfit quickly. From my very limited testing, I found a learning rate of `0.5` to be suitable in addition to `--prodigy_use_bias_correction`, `prodigy_safeguard_warmup` and `--prodigy_decouple`.
+> - When using the Prodigy optimizer for training, one can follow the recommendations from [this](https://huggingface.co/blog/sdxl_lora_advanced_script) blog. Prodigy tends to overfit quickly. From my very limited testing, I found a learning rate of `0.5` to be suitable in addition to `--prodigy_use_bias_correction`, `prodigy_safeguard_warmup` and `--prodigy_decouple`.
 > - The recommended learning rate by the CogVideoX authors and from our experimentation with Adam/AdamW is between `1e-3` and `1e-4` for a dataset of 25+ videos.
 >
 > Note that our testing is not exhaustive due to limited time for exploration. Our recommendation would be to play around with the different knobs and dials to find the best settings for your data.
@@ -879,7 +879,7 @@ def prepare_rotary_positional_embeddings(


 def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
-    # Use DeepSpeed optimizer
+    # Use DeepSpeed optimizer
    if use_deepspeed:
        from accelerate.utils import DummyOptim

@@ -901,7 +901,7 @@ def prepare_rotary_positional_embeddings(


 def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
-    # Use DeepSpeed optimizer
+    # Use DeepSpeed optimizer
    if use_deepspeed:
        from accelerate.utils import DummyOptim

@@ -4865,7 +4865,7 @@ python -m pip install intel_extension_for_pytorch
 ```
 python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
 ```
-2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX acceleration. Supported inference datatypes are Float32 and BFloat16.
+2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX acceleration. Supported inference datatypes are Float32 and BFloat16.

 ```python
 pipe = AnimateDiffPipelineIpex.from_pretrained(base, motion_adapter=adapter, torch_dtype=dtype).to(device)
@@ -336,13 +336,13 @@ if __name__ == "__main__":
                    expanded_kernel_width = np.ceil(kernel_width) + 2

                    # Determine a set of field_of_view for each each output position, these are the pixels in the input image
-                    # that the pixel in the output image 'sees'. We get a matrix whose horizontal dim is the output pixels (big) and the
+                    # that the pixel in the output image 'sees'. We get a matrix whose horizontal dim is the output pixels (big) and the
                    # vertical dim is the pixels it 'sees' (kernel_size + 2)
                    field_of_view = np.squeeze(
                        np.int16(np.expand_dims(left_boundary, axis=1) + np.arange(expanded_kernel_width) - 1)
                    )

-                    # Assign weight to each pixel in the field of view. A matrix whose horizontal dim is the output pixels and the
+                    # Assign weight to each pixel in the field of view. A matrix whose horizontal dim is the output pixels and the
                    # vertical dim is a list of weights matching to the pixel in the field of view (that are specified in
                    # 'field_of_view')
                    weights = fixed_kernel(1.0 * np.expand_dims(match_coordinates, axis=1) - field_of_view - 1)
@@ -201,16 +201,16 @@ class PAIntAAttnProcessor:
        # ================================================== #
        # We use a hack by running the code from the BasicTransformerBlock that is between Self and Cross attentions here
        # The other option would've been modifying the BasicTransformerBlock and adding this functionality here.
-        # I assumed that changing the BasicTransformerBlock would have been a bigger deal and decided to use this hack instead.
+        # I assumed that changing the BasicTransformerBlock would have been a bigger deal and decided to use this hack instead.

-        # The SelfAttention block receives the normalized latents from the BasicTransformerBlock,
+        # The SelfAttention block receives the normalized latents from the BasicTransformerBlock,
        # But the residual of the output is the non-normalized version.
        # Therefore we unnormalize the input hidden state here
        unnormalized_input_hidden_states = (
            input_hidden_states + self.transformer_block.norm1.bias
        ) * self.transformer_block.norm1.weight

-        # TODO: return if necessary
+        # TODO: return if necessary
        # if self.use_ada_layer_norm_zero:
        #     attn_output = gate_msa.unsqueeze(1) * attn_output
        # elif self.use_ada_layer_norm_single:
@@ -220,7 +220,7 @@ class PAIntAAttnProcessor:
        if transformer_hidden_states.ndim == 4:
            transformer_hidden_states = transformer_hidden_states.squeeze(1)

-        # TODO: return if necessary
+        # TODO: return if necessary
        # 2.5 GLIGEN Control
        # if gligen_kwargs is not None:
        #     transformer_hidden_states = self.fuser(transformer_hidden_states, gligen_kwargs["objs"])
@@ -266,7 +266,7 @@ class PAIntAAttnProcessor:
            ) = cross_attention_input_hidden_states.chunk(2)

            # Same split for the encoder_hidden_states i.e. the tokens
-            # Since the SelfAttention processors don't get the encoder states as input, we inject them into the processor in the beginning.
+            # Since the SelfAttention processors don't get the encoder states as input, we inject them into the processor in the beginning.
            _encoder_hidden_states_unconditional, encoder_hidden_states_conditional = self.encoder_hidden_states.chunk(
                2
            )
@@ -896,7 +896,7 @@ class StableDiffusionHDPainterPipeline(StableDiffusionInpaintPipeline):
 class GaussianSmoothing(nn.Module):
    """
    Apply gaussian smoothing on a
-    1d, 2d or 3d tensor. Filtering is performed separately for each channel
+    1d, 2d or 3d tensor. Filtering is performed separately for each channel
    in the input using a depthwise convolution.

    Args:
@@ -161,7 +161,7 @@ class ImageToImageInpaintingPipeline(DiffusionPipeline):
                `Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
                be masked out with `mask_image` and repainted according to `prompt`.
            inner_image (`torch.Tensor` or `PIL.Image.Image`):
-                `Image`, or tensor representing an image batch which will be overlaid onto `image`. Non-transparent
+                `Image`, or tensor representing an image batch which will be overlaid onto `image`. Non-transparent
                regions of `inner_image` must fit inside white pixels in `mask_image`. Expects four channels, with
                the last channel representing the alpha channel, which will be used to blend `inner_image` with
                `image`. If not provided, it will be forcibly cast to RGBA.
@@ -647,7 +647,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
        return sample

    def set_timesteps(
-        self, strength, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None
+        self, stength, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None
    ):
        """
        Sets the discrete timesteps used for the diffusion chain (to be run before inference).
@@ -668,7 +668,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
        # LCM Timesteps Setting:  # Linear Spacing
        c = self.config.num_train_timesteps // lcm_origin_steps
        lcm_origin_timesteps = (
-            np.asarray(list(range(1, int(lcm_origin_steps * strength) + 1))) * c - 1
+            np.asarray(list(range(1, int(lcm_origin_steps * stength) + 1))) * c - 1
        )  # LCM Training  Steps Schedule
        skipping_step = len(lcm_origin_timesteps) // num_inference_steps
        timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps]  # LCM Inference Steps Schedule
@@ -129,7 +129,7 @@ class MagicMixPipeline(DiffusionPipeline):

                    input = (
                        (mix_factor * latents) + (1 - mix_factor) * orig_latents
-                    )  # interpolating between layout noise and conditionally generated noise to preserve layout semantics
+                    )  # interpolating between layout noise and conditionally generated noise to preserve layout semantics
                    input = torch.cat([input] * 2)

                else:  # content generation phase
@@ -196,9 +196,9 @@ class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixi
            guidance_scale_tiles: specific weights for classifier-free guidance in each tile.
            guidance_scale_tiles: specific weights for classifier-free guidance in each tile. If None, the value provided in guidance_scale will be used.
            seed_tiles: specific seeds for the initialization latents in each tile. These will override the latents generated for the whole canvas using the standard seed parameter.
-            seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overridden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overridden.
-            seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overridden using the given seed. Takes priority over seed_tiles.
-            cpu_vae: the decoder from latent space to pixel space can require too much GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues.
+            seed_tiles_mode: either "full" or "exclusive". If "full", all the latents affected by the tile will be overridden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overridden.
+            seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overridden using the given seed. Takes priority over seed_tiles.
+            cpu_vae: the decoder from latent space to pixel space can require too much GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder on the CPU. Slower, but should run without memory issues.

        Examples:

@@ -1258,7 +1258,7 @@ class KolorsControlNetPipeline(
                    )

                    if guess_mode and self.do_classifier_free_guidance:
-                        # Inferred ControlNet only for the conditional batch.
+                        # Inferred ControlNet only for the conditional batch.
                        # To apply the output of ControlNet to both the unconditional and conditional batches,
                        # add 0 to the unconditional batch to keep it unchanged.
                        down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -1462,7 +1462,7 @@ class KolorsControlNetImg2ImgPipeline(
                    )

                    if guess_mode and self.do_classifier_free_guidance:
-                        # Inferred ControlNet only for the conditional batch.
+                        # Inferred ControlNet only for the conditional batch.
                        # To apply the output of ControlNet to both the unconditional and conditional batches,
                        # add 0 to the unconditional batch to keep it unchanged.
                        down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -1782,7 +1782,7 @@ class KolorsControlNetInpaintPipeline(
                    )

                    if guess_mode and self.do_classifier_free_guidance:
-                        # Inferred ControlNet only for the conditional batch.
+                        # Inferred ControlNet only for the conditional batch.
                        # To apply the output of ControlNet to both the unconditional and conditional batches,
                        # add 0 to the unconditional batch to keep it unchanged.
                        down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
@@ -559,7 +559,7 @@ class FabricPipeline(DiffusionPipeline):
                End point for providing feedback (between 0 and 1).
            min_weight (`float`, *optional*, defaults to `.05`):
                Minimum weight for feedback.
-            max_weight (`float`, *optional*, defaults tp `1.0`):
+            max_weight (`float`, *optional*, defaults to `1.0`):
                Maximum weight for feedback.
            neg_scale (`float`, *optional*, defaults to `.5`):
                Scale factor for negative feedback.
@@ -118,7 +118,7 @@ EXAMPLE_DOC_STRING = """
        >>> # Here we need use pipeline internal unet model
        >>> pipe.unet = pipe.unet_model.from_pretrained(model_id, subfolder="unet", variant="fp16", use_safetensors=True)
        >>>
-        >>> # Load additional layers to the model
+        >>> # Load additional layers to the model
        >>> pipe.unet.load_additional_layers(weight_path="proc_data/faithdiff/FaithDiff.bin", dtype=dtype)
        >>>
        >>> # Enable vae tiling
@@ -72,7 +72,7 @@ class GaussianSmoothing(nn.Module):
    """
    Copied from official repo: https://github.com/showlab/BoxDiff/blob/master/utils/gaussian_smoothing.py
    Apply gaussian smoothing on a
-    1d, 2d or 3d tensor. Filtering is performed separately for each channel
+    1d, 2d or 3d tensor. Filtering is performed separately for each channel
    in the input using a depthwise convolution.
    Arguments:
        channels (int, sequence): Number of channels of the input tensors. Output will
@@ -1509,7 +1509,7 @@ class StableDiffusionXL_AE_Pipeline(

        add_time_ids = add_time_ids.repeat(batch_size, 1).to(DEVICE)

-        # interactive sampling
+        # iterative sampling
        self.scheduler.set_timesteps(num_inference_steps)
        latents_list = [latents]
        pred_x0_list = []
@@ -1548,7 +1548,7 @@ class StableDiffusionXL_AE_Pipeline(
        x: torch.FloatTensor,
    ):
        """
-        predict the sample the next step in the denoise process.
+        predict the sample for the next step in the denoise process.
        """
        ref_noise = model_output[:1, :, :, :].expand(model_output.shape)
        alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
@@ -132,7 +132,7 @@ def _preprocess_adapter_image(image, height, width):
            image = torch.cat(image, dim=0)
        else:
            raise ValueError(
-                f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but receive: {image[0].ndim}"
+                f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}"
            )
    return image

@@ -150,7 +150,7 @@ def _preprocess_adapter_image(image, height, width):
            image = torch.cat(image, dim=0)
        else:
            raise ValueError(
-                f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but receive: {image[0].ndim}"
+                f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}"
            )
    return image

@@ -220,7 +220,7 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
            revers = True

            def pcallback(s_self, step: int, timestep: int, latents: torch.Tensor, selfs=None):
-                if "PRO" in mode:  # in Prompt mode, make masks from sum of attention maps
+                if "PRO" in mode:  # in Prompt mode, make masks from sum of attention maps
                    self.step = step

                    if len(self.attnmaps_sizes) > 3:
@@ -552,9 +552,9 @@ def get_attn_maps(self, attn):

 def reset_attnmaps(self):  # init parameters in every batch
    self.step = 0
-    self.attnmaps = {}  # made from attention maps
+    self.attnmaps = {}  # made from attention maps
    self.attnmaps_sizes = []  # height,width set of u-net blocks
-    self.attnmasks = {}  # made from attnmaps for regions
+    self.attnmasks = {}  # made from attnmaps for regions
    self.maskready = False
    self.history = {}

@@ -97,7 +97,7 @@ class SdeDragPipeline(DiffusionPipeline):
            steps (`int`, *optional*, defaults to 200):
                The number of sampling iterations.
            step_size (`int`, *optional*, defaults to 2):
-                The drag distance of each drag step.
+                The drag distance of each drag step.
            image_scale (`float`, *optional*, defaults to 0.3):
                To avoid duplicating the content, use image_scale to perturbs the source.
            adapt_radius (`int`, *optional*, defaults to 5):
@@ -284,7 +284,7 @@ class UnCLIPImageInterpolationPipeline(DiffusionPipeline):
                )
        else:
            raise AssertionError(
-                f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or torch.Tensor respectively. Received {type(image)} and {type(image_embeddings)} respectively"
+                f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or torch.Tensor respectively. Received {type(image)} and {type(image_embeddings)} repsectively"
            )

        original_image_embeddings = self._encode_image(
@@ -1012,7 +1012,7 @@ def main(args):
    unet = get_peft_model(unet, lora_config)

    # 9. Handle mixed precision and device placement
-    # For mixed precision training we cast all non-trainable weights to half-precision
+    # For mixed precision training we cast all non-trainable weights to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -829,7 +829,7 @@ def main(args):
        )

    # 8. Handle mixed precision and device placement
-    # For mixed precision training we cast all non-trainable weights to half-precision
+    # For mixed precision training we cast all non-trainable weights to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -1026,7 +1026,7 @@ def main(args):
    unet = get_peft_model(unet, lora_config)

    # 9. Handle mixed precision and device placement
-    # For mixed precision training we cast all non-trainable weights to half-precision
+    # For mixed precision training we cast all non-trainable weights to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -962,7 +962,7 @@ def main(args):
        )

    # 9. Handle mixed precision and device placement
-    # For mixed precision training we cast all non-trainable weights to half-precision
+    # For mixed precision training we cast all non-trainable weights to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -1021,7 +1021,7 @@ def main(args):
        )

    # 9. Handle mixed precision and device placement
-    # For mixed precision training we cast all non-trainable weights to half-precision
+    # For mixed precision training we cast all non-trainable weights to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -411,7 +411,7 @@ export CAPTION_COLUMN='caption_column'

 export CACHE_DIR="/data/train_csr/.cache/huggingface/"
 export OUTPUT_DIR='/data/train_csr/FLUX/MODEL_OUT/'$MODEL_TYPE
-# The first step is to use Python to precompute all caches.Replace the first line below with this line. (I am not sure why using accelerate would cause problems.)
+# The first step is to use Python to precompute all caches. Replace the first line below with this line. (I am not sure why using accelerate would cause problems.)

 CUDA_VISIBLE_DEVICES=0 python3 train_controlnet_flux.py \

@@ -173,13 +173,13 @@ accelerate launch train_dreambooth_lora_flux.py \
 ### Target Modules
 When LoRA was first adapted from language models to diffusion models, it was applied to the cross-attention layers in the Unet that relate the image representations with the prompts that describe them. 
 More recently, SOTA text-to-image diffusion models replaced the Unet with a diffusion Transformer(DiT). With this change, we may also want to explore 
-applying LoRA training onto different types of layers and blocks. To allow more flexibility and control over the targeted modules we added `--lora_layers`- in which you can specify in a comma separated string
+applying LoRA training onto different types of layers and blocks. To allow more flexibility and control over the targeted modules we added `--lora_layers`- in which you can specify in a comma separated string
 the exact modules for LoRA training. Here are some examples of target modules you can provide: 
 - for attention only layers: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0"`
 - to train the same modules as in the fal trainer: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2"`
 - to train the same modules as in ostris ai-toolkit / replicate trainer: `--lora_blocks="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2,norm1_context.linear, norm1.linear,norm.linear,proj_mlp,proj_out"`
 > [!NOTE]
-> `--lora_layers` can also be used to specify which **blocks** to apply LoRA training to. To do so, simply add a block prefix to each layer in the comma separated string:
+> `--lora_layers` can also be used to specify which **blocks** to apply LoRA training to. To do so, simply add a block prefix to each layer in the comma separated string:
 > **single DiT blocks**: to target the ith single transformer block, add the prefix `single_transformer_blocks.i`, e.g. - `single_transformer_blocks.i.attn.to_k`
 > **MMDiT blocks**: to target the ith MMDiT block, add the prefix `transformer_blocks.i`, e.g. - `transformer_blocks.i.attn.to_k` 
 > [!NOTE]
@@ -107,7 +107,7 @@ To better track our training experiments, we're using the following flags in the

 Additionally, we welcome you to explore the following CLI arguments:

-* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
+* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma-separated string. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
 * `--rank`: The rank of the LoRA layers. The higher the rank, the more parameters are trained. The default is 16.

 We provide several options for optimizing memory optimization:
@@ -117,30 +117,3 @@ We provide several options for optimizing memory optimization:
 * `--use_8bit_adam`: When enabled, we will use the 8bit version of AdamW provided by the `bitsandbytes` library.

 Refer to the [official documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/) of the `HiDreamImagePipeline` to know more about the model.
-
-## Using quantization
-
-You can quantize the base model with [`bitsandbytes`](https://huggingface.co/docs/bitsandbytes/index) to reduce memory usage. To do so, pass a JSON file path to `--bnb_quantization_config_path`. This file should hold the configuration to initialize `BitsAndBytesConfig`. Below is an example JSON file:
-
-```json
-{
-    "load_in_4bit": true,
-    "bnb_4bit_quant_type": "nf4"
-}
-```
-
-Below, we provide some numbers with and without the use of NF4 quantization when training:
-
-```
-(with quantization)
-Memory (before device placement): 9.085089683532715 GB.
-Memory (after device placement): 34.59585428237915 GB.
-Memory (after backward): 36.90267467498779 GB.
-
-(without quantization)
-Memory (before device placement): 0.0 GB.
-Memory (after device placement): 57.6400408744812 GB.
-Memory (after backward): 59.932212829589844 GB.
-```
-
-The reason why we see some memory before device placement in the case of quantization is because, by default bnb quantized models are placed on the GPU first.
@@ -113,7 +113,7 @@ To better track our training experiments, we're using the following flags in the

 Additionally, we welcome you to explore the following CLI arguments:

-* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
+* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma-separated string. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
 * `--system_prompt`: A custom system prompt to provide additional personality to the model.
 * `--max_sequence_length`: Maximum sequence length to use for text embeddings.

@@ -113,7 +113,7 @@ To better track our training experiments, we're using the following flags in the

 Additionally, we welcome you to explore the following CLI arguments:

-* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
+* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma-separated string. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
 * `--complex_human_instruction`: Instructions for complex human attention as shown in [here](https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55).
 * `--max_sequence_length`: Maximum sequence length to use for text embeddings.

@@ -1114,22 +1114,17 @@ def main(args):
    )

    # Scheduler and math around the number of training steps.
-    # Check the PR https://github.com/huggingface/diffusers/pull/8312 for detailed explanation.
-    num_warmup_steps_for_scheduler = args.lr_warmup_steps * accelerator.num_processes
+    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
-        len_train_dataloader_after_sharding = math.ceil(len(train_dataloader) / accelerator.num_processes)
-        num_update_steps_per_epoch = math.ceil(len_train_dataloader_after_sharding / args.gradient_accumulation_steps)
-        num_training_steps_for_scheduler = (
-            args.num_train_epochs * accelerator.num_processes * num_update_steps_per_epoch
-        )
-    else:
-        num_training_steps_for_scheduler = args.max_train_steps * accelerator.num_processes
+        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
+        overrode_max_train_steps = True

    lr_scheduler = get_scheduler(
        args.lr_scheduler,
        optimizer=optimizer,
-        num_warmup_steps=num_warmup_steps_for_scheduler,
-        num_training_steps=num_training_steps_for_scheduler,
+        num_warmup_steps=args.lr_warmup_steps * accelerator.num_processes,
+        num_training_steps=args.max_train_steps * accelerator.num_processes,
        num_cycles=args.lr_num_cycles,
        power=args.lr_power,
    )
@@ -1161,14 +1156,8 @@ def main(args):

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
-    if args.max_train_steps is None:
+    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
-        if num_training_steps_for_scheduler != args.max_train_steps:
-            logger.warning(
-                f"The length of the 'train_dataloader' after 'accelerator.prepare' ({len(train_dataloader)}) does not match "
-                f"the expected length ({len_train_dataloader_after_sharding}) when the learning rate scheduler was created. "
-                f"This inconsistency may result in the learning rate scheduler not functioning properly."
-            )
    # Afterwards we recalculate our number of training epochs
    args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

@@ -524,9 +524,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--image_interpolation_mode",
        type=str,
@@ -935,7 +932,6 @@ def main(args):
    unet_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=["to_k", "to_q", "to_v", "to_out.0", "add_k_proj", "add_v_proj"],
    )
@@ -946,7 +942,6 @@ def main(args):
        text_lora_config = LoraConfig(
            r=args.rank,
            lora_alpha=args.rank,
-            lora_dropout=args.lora_dropout,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -358,9 +358,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -570,7 +567,7 @@ def parse_args(input_args=None):
        type=str,
        default=None,
        help=(
-            'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
+            'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
        ),
    )

@@ -1239,7 +1236,6 @@ def main(args):
    transformer_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=target_modules,
    )
@@ -1248,7 +1244,6 @@ def main(args):
        text_lora_config = LoraConfig(
            r=args.rank,
            lora_alpha=args.rank,
-            lora_dropout=args.lora_dropout,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -16,7 +16,6 @@
 import argparse
 import copy
 import itertools
-import json
 import logging
 import math
 import os
@@ -28,13 +27,14 @@ from pathlib import Path

 import numpy as np
 import torch
+import torch.utils.checkpoint
 import transformers
 from accelerate import Accelerator
 from accelerate.logging import get_logger
 from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
 from huggingface_hub import create_repo, upload_folder
 from huggingface_hub.utils import insecure_hashlib
-from peft import LoraConfig, prepare_model_for_kbit_training, set_peft_model_state_dict
+from peft import LoraConfig, set_peft_model_state_dict
 from peft.utils import get_peft_model_state_dict
 from PIL import Image
 from PIL.ImageOps import exif_transpose
@@ -47,7 +47,6 @@ from transformers import AutoTokenizer, CLIPTokenizer, LlamaForCausalLM, Pretrai
 import diffusers
 from diffusers import (
    AutoencoderKL,
-    BitsAndBytesConfig,
    FlowMatchEulerDiscreteScheduler,
    HiDreamImagePipeline,
    HiDreamImageTransformer2DModel,
@@ -283,12 +282,6 @@ def parse_args(input_args=None):
        default="meta-llama/Meta-Llama-3.1-8B-Instruct",
        help="Path to pretrained model or model identifier from huggingface.co/models.",
    )
-    parser.add_argument(
-        "--bnb_quantization_config_path",
-        type=str,
-        default=None,
-        help="Quantization config in a JSON file that will be used to define the bitsandbytes quant config of the DiT.",
-    )
    parser.add_argument(
        "--revision",
        type=str,
@@ -417,9 +410,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -606,7 +596,7 @@ def parse_args(input_args=None):
        type=str,
        default=None,
        help=(
-            'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
+            'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
        ),
    )

@@ -1066,14 +1056,6 @@ def main(args):
        args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_3"
    )

-    # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
-    # as these weights are only used for inference, keeping weights in full precision is not required.
-    weight_dtype = torch.float32
-    if accelerator.mixed_precision == "fp16":
-        weight_dtype = torch.float16
-    elif accelerator.mixed_precision == "bf16":
-        weight_dtype = torch.bfloat16
-
    # Load scheduler and models
    noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
        args.pretrained_model_name_or_path, subfolder="scheduler", revision=args.revision, shift=3.0
@@ -1082,31 +1064,20 @@ def main(args):
    text_encoder_one, text_encoder_two, text_encoder_three, text_encoder_four = load_text_encoders(
        text_encoder_cls_one, text_encoder_cls_two, text_encoder_cls_three
    )
+
    vae = AutoencoderKL.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="vae",
        revision=args.revision,
        variant=args.variant,
    )
-    quantization_config = None
-    if args.bnb_quantization_config_path is not None:
-        with open(args.bnb_quantization_config_path, "r") as f:
-            config_kwargs = json.load(f)
-            if "load_in_4bit" in config_kwargs and config_kwargs["load_in_4bit"]:
-                config_kwargs["bnb_4bit_compute_dtype"] = weight_dtype
-        quantization_config = BitsAndBytesConfig(**config_kwargs)
-
    transformer = HiDreamImageTransformer2DModel.from_pretrained(
        args.pretrained_model_name_or_path,
        subfolder="transformer",
        revision=args.revision,
        variant=args.variant,
-        quantization_config=quantization_config,
-        torch_dtype=weight_dtype,
        force_inference_output=True,
    )
-    if args.bnb_quantization_config_path is not None:
-        transformer = prepare_model_for_kbit_training(transformer, use_gradient_checkpointing=False)

    # We only train the additional adapter LoRA layers
    transformer.requires_grad_(False)
@@ -1116,6 +1087,14 @@ def main(args):
    text_encoder_three.requires_grad_(False)
    text_encoder_four.requires_grad_(False)

+    # For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
+    # as these weights are only used for inference, keeping weights in full precision is not required.
+    weight_dtype = torch.float32
+    if accelerator.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif accelerator.mixed_precision == "bf16":
+        weight_dtype = torch.bfloat16
+
    if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
        # due to pytorch#99272, MPS does not yet support bfloat16.
        raise ValueError(
@@ -1130,12 +1109,7 @@ def main(args):
    text_encoder_three.to(**to_kwargs)
    text_encoder_four.to(**to_kwargs)
    # we never offload the transformer to CPU, so we can just use the accelerator device
-    transformer_to_kwargs = (
-        {"device": accelerator.device}
-        if args.bnb_quantization_config_path is not None
-        else {"device": accelerator.device, "dtype": weight_dtype}
-    )
-    transformer.to(**transformer_to_kwargs)
+    transformer.to(accelerator.device, dtype=weight_dtype)

    # Initialize a text encoding pipeline and keep it to CPU for now.
    text_encoding_pipeline = HiDreamImagePipeline.from_pretrained(
@@ -1164,7 +1138,6 @@ def main(args):
    transformer_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=target_modules,
    )
@@ -1722,11 +1695,10 @@ def main(args):
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        transformer = unwrap_model(transformer)
-        if args.bnb_quantization_config_path is None:
-            if args.upcast_before_saving:
-                transformer.to(torch.float32)
-            else:
-                transformer = transformer.to(weight_dtype)
+        if args.upcast_before_saving:
+            transformer.to(torch.float32)
+        else:
+            transformer = transformer.to(weight_dtype)
        transformer_lora_layers = get_peft_model_state_dict(transformer)

        HiDreamImagePipeline.save_lora_weights(
@@ -328,9 +328,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -517,7 +514,7 @@ def parse_args(input_args=None):
        type=str,
        default=None,
        help=(
-            'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
+            'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
        ),
    )

@@ -602,15 +599,6 @@ def parse_args(input_args=None):
            "Defaults to precision dtype used for training to save memory"
        ),
    )
-    parser.add_argument(
-        "--image_interpolation_mode",
-        type=str,
-        default="lanczos",
-        choices=[
-            f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
-        ],
-        help="The image interpolation method to use for resizing images.",
-    )
    parser.add_argument(
        "--offload",
        action="store_true",
@@ -736,11 +724,7 @@ class DreamBoothDataset(Dataset):
            self.instance_images.extend(itertools.repeat(img, repeats))

        self.pixel_values = []
-        interpolation = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper(), None)
-        if interpolation is None:
-            raise ValueError(f"Unsupported interpolation mode: {args.image_interpolation_mode}")
-
-        train_resize = transforms.Resize(size, interpolation=interpolation)
+        train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
        train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
        train_flip = transforms.RandomHorizontalFlip(p=1.0)
        train_transforms = transforms.Compose(
@@ -784,7 +768,7 @@ class DreamBoothDataset(Dataset):

        self.image_transforms = transforms.Compose(
            [
-                transforms.Resize(size, interpolation=interpolation),
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
@@ -1026,7 +1010,6 @@ def main(args):
    transformer_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=target_modules,
    )
@@ -323,9 +323,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -516,7 +513,7 @@ def parse_args(input_args=None):
        type=str,
        default=None,
        help=(
-            'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
+            'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
        ),
    )

@@ -1024,7 +1021,6 @@ def main(args):
    transformer_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=target_modules,
    )
@@ -367,9 +367,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -579,7 +576,7 @@ def parse_args(input_args=None):
        type=str,
        default=None,
        help=(
-            "The transformer block layers to apply LoRA training on. Please specify the layers in a comma separated string."
+            "The transformer block layers to apply LoRA training on. Please specify the layers in a comma seperated string."
            "For examples refer to https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_SD3.md"
        ),
    )
@@ -588,7 +585,7 @@ def parse_args(input_args=None):
        type=str,
        default=None,
        help=(
-            "The transformer blocks to apply LoRA training on. Please specify the block numbers in a comma separated manner."
+            "The transformer blocks to apply LoRA training on. Please specify the block numbers in a comma seperated manner."
            'E.g. - "--lora_blocks 12,30" will result in lora training of transformer blocks 12 and 30. For more examples refer to https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_SD3.md'
        ),
    )
@@ -1267,7 +1264,6 @@ def main(args):
    transformer_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=target_modules,
    )
@@ -1277,7 +1273,6 @@ def main(args):
        text_lora_config = LoraConfig(
            r=args.rank,
            lora_alpha=args.rank,
-            lora_dropout=args.lora_dropout,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -659,15 +659,12 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--use_dora",
        action="store_true",
        default=False,
        help=(
-            "Whether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
+            "Whether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
            "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
        ),
    )
@@ -855,7 +852,7 @@ class DreamBoothDataset(Dataset):

        self.image_transforms = transforms.Compose(
            [
-                transforms.Resize(size, interpolation=interpolation),
+                transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
                transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
                transforms.ToTensor(),
                transforms.Normalize([0.5], [0.5]),
@@ -1202,11 +1199,10 @@ def main(args):
            text_encoder_one.gradient_checkpointing_enable()
            text_encoder_two.gradient_checkpointing_enable()

-    def get_lora_config(rank, dropout, use_dora, target_modules):
+    def get_lora_config(rank, use_dora, target_modules):
        base_config = {
            "r": rank,
            "lora_alpha": rank,
-            "lora_dropout": dropout,
            "init_lora_weights": "gaussian",
            "target_modules": target_modules,
        }
@@ -1222,24 +1218,14 @@ def main(args):

    # now we will add new LoRA weights to the attention layers
    unet_target_modules = ["to_k", "to_q", "to_v", "to_out.0"]
-    unet_lora_config = get_lora_config(
-        rank=args.rank,
-        dropout=args.lora_dropout,
-        use_dora=args.use_dora,
-        target_modules=unet_target_modules,
-    )
+    unet_lora_config = get_lora_config(rank=args.rank, use_dora=args.use_dora, target_modules=unet_target_modules)
    unet.add_adapter(unet_lora_config)

    # The text encoder comes from 🤗 transformers, so we cannot directly modify it.
    # So, instead, we monkey-patch the forward calls of its attention-blocks.
    if args.train_text_encoder:
        text_target_modules = ["q_proj", "k_proj", "v_proj", "out_proj"]
-        text_lora_config = get_lora_config(
-            rank=args.rank,
-            dropout=args.lora_dropout,
-            use_dora=args.use_dora,
-            target_modules=text_target_modules,
-        )
+        text_lora_config = get_lora_config(rank=args.rank, use_dora=args.use_dora, target_modules=text_target_modules)
        text_encoder_one.add_adapter(text_lora_config)
        text_encoder_two.add_adapter(text_lora_config)

@@ -329,7 +329,7 @@ def parse_args(input_args=None):
        type=str,
        default=None,
        help=(
-            'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
+            'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
        ),
    )
    parser.add_argument(
@@ -400,7 +400,7 @@ def main():

    image_encoder.requires_grad_(False)

-    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -1147,7 +1147,7 @@ def main(args):
        tracker_config = dict(vars(args))
        accelerator.init_trackers(args.tracker_project_name, config=tracker_config)

-    # Function for unwrapping if torch.compile() was used in accelerate.
+    # Function for unwrapping if torch.compile() was used in accelerate.
    def unwrap_model(model):
        model = accelerator.unwrap_model(model)
        model = model._orig_mod if is_compiled_module(model) else model
@@ -812,7 +812,7 @@ def main(args):

    if args.scale_lr:
        args.learning_rate = (
-            args.learning_rate * args.gradient_accumulation_steps * args.per_gpu_batch_size * accelerator.num_processes
+            args.learning_rate * args.gradient_accumulation_steps * args.per_gpu_batch_size * accelerator.num_processes
        )

    # Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
@@ -69,7 +69,7 @@ accelerate launch --config_file=accelerate.yaml \
  --seed="0"
 ```

-We can directly pass a quantized checkpoint path, too:
+We can directly pass a quantized checkpoint path, too:

 ```diff
 + --quantized_model_path="hf-internal-testing/flux.1-dev-nf4-pkg"
@@ -13,7 +13,7 @@ args = parser.parse_args()


 device = "cpu"
-prompt = "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brightly buildings"
+prompt = "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brightly buildings"

 model_id = "path-to-your-trained-model"
 pipe = StableDiffusionPipeline.from_pretrained(model_id)
@@ -80,7 +80,7 @@ export INT8_MODEL_NAME="./int8_model"

 python text2images.py \
  --pretrained_model_name_or_path=$INT8_MODEL_NAME \
-  --caption "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brightly buildings." \
+  --caption "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brightly buildings." \
  --images_num 4
 ```

@@ -664,7 +664,7 @@ class PixArtAlphaControlnetPipeline(DiffusionPipeline):
        # &amp
        caption = re.sub(r"&amp", "", caption)

-        # ip addresses:
+        # ip addresses:
        caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)

        # article ids:
@@ -612,7 +612,7 @@ def main():
    # See Section 3.1. of the paper.
    max_length = 120

-    # For mixed precision training we cast all non-trainable weights (vae, text_encoder) to half-precision
+    # For mixed precision training we cast all non-trainable weights (vae, text_encoder) to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -120,11 +120,11 @@ if __name__ == "__main__":
    parser.add_argument("--schnell", action="store_true", help="run flux schnell instead of dev")
    parser.add_argument("--width", type=int, default=1024, help="width of the image to generate")
    parser.add_argument("--height", type=int, default=1024, help="height of the image to generate")
-    parser.add_argument("--guidance", type=float, default=3.5, help="guidance strength for dev")
+    parser.add_argument("--guidance", type=float, default=3.5, help="guidance strength for dev")
    parser.add_argument("--seed", type=int, default=None, help="seed for inference")
    parser.add_argument("--profile", action="store_true", help="enable profiling")
    parser.add_argument("--profile-duration", type=int, default=10000, help="duration for profiling in msec.")
-    parser.add_argument("--itters", type=int, default=15, help="items to run inference and get avg time in sec.")
+    parser.add_argument("--itters", type=int, default=15, help="times to run inference and get avg time in sec.")
    args = parser.parse_args()
    if args.schnell:
        ckpt_id = "black-forest-labs/FLUX.1-schnell"
@@ -759,7 +759,7 @@ def main(args):
        unet, text_encoder, optimizer, train_dataloader
    )

-    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -1,95 +0,0 @@
-# Training SANA Sprint Diffuser
-
-This README explains how to use the provided bash script commands to download a pre-trained teacher diffuser model and train it on a specific dataset, following the [SANA Sprint methodology](https://arxiv.org/abs/2503.09641).
-
-
-## Setup
-
-### 1. Define the local paths
-
-Set a variable for your desired output directory. This directory will store the downloaded model and the training checkpoints/results.
-
-```bash
-your_local_path='output' # Or any other path you prefer
-mkdir -p $your_local_path # Create the directory if it doesn't exist
-```
-
-### 2. Download the pre-trained model
-
-Download the SANA Sprint teacher model from Hugging Face Hub. The script uses the 1.6B parameter model.
-
-```bash
-huggingface-cli download Efficient-Large-Model/SANA_Sprint_1.6B_1024px_teacher_diffusers --local-dir $your_local_path/SANA_Sprint_1.6B_1024px_teacher_diffusers
-```
-
-*(Optional: You can also download the 0.6B model by replacing the model name: `Efficient-Large-Model/Sana_Sprint_0.6B_1024px_teacher_diffusers`)*
-
-### 3. Acquire the dataset shards
-
-The training script in this example uses specific `.parquet` shards from a randomly selected `brivangl/midjourney-v6-llava` dataset instead of downloading the entire dataset automatically via `dataset_name`.
-
-The script specifically uses these three files:
-*   `data/train_000.parquet`
-*   `data/train_001.parquet`
-*   `data/train_002.parquet`
-
-
-
-You can either:
-
-Let the script download the dataset automatically during first run
-
-Or download it manually
-
-**Note:** The full `brivangl/midjourney-v6-llava` dataset is much larger and contains many more shards. This script example explicitly trains *only* on the three specified shards.
-
-## Usage
-
-Once the model is downloaded, you can run the training script.
-
-```bash
-
-your_local_path='output' # Ensure this variable is set
-
-python train_sana_sprint_diffusers.py \
-    --pretrained_model_name_or_path=$your_local_path/SANA_Sprint_1.6B_1024px_teacher_diffusers \
-    --output_dir=$your_local_path \
-    --mixed_precision=bf16 \
-    --resolution=1024 \
-    --learning_rate=1e-6 \
-    --max_train_steps=30000 \
-    --dataloader_num_workers=8 \
-    --dataset_name='brivangl/midjourney-v6-llava' \
-    --file_path data/train_000.parquet data/train_001.parquet data/train_002.parquet \
-    --checkpointing_steps=500 --checkpoints_total_limit=10 \
-    --train_batch_size=1 \
-    --gradient_accumulation_steps=1 \
-    --seed=453645634 \
-    --train_largest_timestep \
-    --misaligned_pairs_D \
-    --gradient_checkpointing \
-    --resume_from_checkpoint="latest" \
-```
-
-### Explanation of parameters
-
-*   `--pretrained_model_name_or_path`: Path to the downloaded pre-trained model directory.
-*   `--output_dir`: Directory where training logs, checkpoints, and the final model will be saved.
-*   `--mixed_precision`: Use BF16 mixed precision for training, which can save memory and speed up training on compatible hardware.
-*   `--resolution`: The image resolution used for training (1024x1024).
-*   `--learning_rate`: The learning rate for the optimizer.
-*   `--max_train_steps`: The total number of training steps to perform.
-*   `--dataloader_num_workers`: Number of worker processes for loading data. Increase for faster data loading if your CPU and disk can handle it.
-*   `--dataset_name`: The name of the dataset on Hugging Face Hub (`brivangl/midjourney-v6-llava`).
-*   `--file_path`: **Specifies the local paths to the dataset shards to be used for training.** In this case, `data/train_000.parquet`, `data/train_001.parquet`, and `data/train_002.parquet`.
-*   `--checkpointing_steps`: Save a training checkpoint every X steps.
-*   `--checkpoints_total_limit`: Maximum number of checkpoints to keep. Older checkpoints will be deleted.
-*   `--train_batch_size`: The batch size per GPU.
-*   `--gradient_accumulation_steps`: Number of steps to accumulate gradients before performing an optimizer step.
-*   `--seed`: Random seed for reproducibility.
-*   `--train_largest_timestep`: A specific training strategy focusing on larger timesteps.
-*   `--misaligned_pairs_D`: Another specific training strategy to add misaligned image-text pairs as fake data for GAN.
-*   `--gradient_checkpointing`: Enable gradient checkpointing to save GPU memory.
-*   `--resume_from_checkpoint`: Allows resuming training from the latest saved checkpoint in the `--output_dir`.
-
-
@@ -1,26 +0,0 @@
-your_local_path='output'
-
-huggingface-cli download Efficient-Large-Model/SANA_Sprint_1.6B_1024px_teacher_diffusers  --local-dir $your_local_path/SANA_Sprint_1.6B_1024px_teacher_diffusers
-
-# or Sana_Sprint_0.6B_1024px_teacher_diffusers
-
-python train_sana_sprint_diffusers.py \
-    --pretrained_model_name_or_path=$your_local_path/SANA_Sprint_1.6B_1024px_teacher_diffusers \
-    --output_dir=$your_local_path \
-    --mixed_precision=bf16 \
-    --resolution=1024 \
-    --learning_rate=1e-6 \
-    --max_train_steps=30000 \
-    --dataloader_num_workers=8 \
-    --dataset_name='brivangl/midjourney-v6-llava' \
-    --file_path data/train_000.parquet data/train_001.parquet data/train_002.parquet \
-    --checkpointing_steps=500 --checkpoints_total_limit=10 \
-    --train_batch_size=1 \
-    --gradient_accumulation_steps=1 \
-    --seed=453645634 \
-    --train_largest_timestep \
-    --misaligned_pairs_D \
-    --gradient_checkpointing \
-    --resume_from_checkpoint="latest" \
-
-
@@ -661,7 +661,7 @@ def parse_args(input_args=None):
        action="store_true",
        default=False,
        help=(
-            "Whether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
+            "Whether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
            "Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
        ),
    )
@@ -480,15 +480,6 @@ def parse_args(input_args=None):
        action="store_true",
        help="debug loss for each image, if filenames are available in the dataset",
    )
-    parser.add_argument(
-        "--image_interpolation_mode",
-        type=str,
-        default="lanczos",
-        choices=[
-            f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
-        ],
-        help="The image interpolation method to use for resizing images.",
-    )

    if input_args is not None:
        args = parser.parse_args(input_args)
@@ -922,14 +913,8 @@ def main(args):
        tokens_two = tokenize_prompt(tokenizer_two, captions)
        return tokens_one, tokens_two

-    # Get the specified interpolation method from the args
-    interpolation = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper(), None)
-
-    # Raise an error if the interpolation method is invalid
-    if interpolation is None:
-        raise ValueError(f"Unsupported interpolation mode {args.image_interpolation_mode}.")
    # Preprocessing the datasets.
-    train_resize = transforms.Resize(args.resolution, interpolation=interpolation)  # Use dynamic interpolation method
+    train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR)
    train_crop = transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution)
    train_flip = transforms.RandomHorizontalFlip(p=1.0)
    train_transforms = transforms.Compose(
@@ -470,15 +470,6 @@ def parse_args(input_args=None):
        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
    )
    parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
-    parser.add_argument(
-        "--image_interpolation_mode",
-        type=str,
-        default="lanczos",
-        choices=[
-            f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
-        ],
-        help="The image interpolation method to use for resizing images.",
-    )

    if input_args is not None:
        args = parser.parse_args(input_args)
@@ -870,10 +861,7 @@ def main(args):
            )

    # Preprocessing the datasets.
-    interpolation = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper(), None)
-    if interpolation is None:
-        raise ValueError(f"Unsupported interpolation mode {interpolation=}.")
-    train_resize = transforms.Resize(args.resolution, interpolation=interpolation)
+    train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR)
    train_crop = transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution)
    train_flip = transforms.RandomHorizontalFlip(p=1.0)
    train_transforms = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])
@@ -789,7 +789,7 @@ def main():
        text_encoder, optimizer, train_dataloader, lr_scheduler
    )

-    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -814,7 +814,7 @@ def main():
        text_encoder_1, text_encoder_2, optimizer, train_dataloader, lr_scheduler
    )

-    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
+    # For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
    # as these weights are only used for inference, keeping weights in full precision is not required.
    weight_dtype = torch.float32
    if accelerator.mixed_precision == "fp16":
@@ -1,352 +0,0 @@
-import argparse
-import pathlib
-from typing import Any, Dict
-
-import torch
-from accelerate import init_empty_weights
-from huggingface_hub import snapshot_download
-from transformers import T5EncoderModel, T5TokenizerFast
-
-from diffusers import AutoencoderKLCosmos, CosmosTextToWorldPipeline, CosmosTransformer3DModel, EDMEulerScheduler
-
-
-def remove_keys_(key: str, state_dict: Dict[str, Any]):
-    state_dict.pop(key)
-
-
-def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
-    state_dict[new_key] = state_dict.pop(old_key)
-
-
-def rename_transformer_blocks_(key: str, state_dict: Dict[str, Any]):
-    block_index = int(key.split(".")[1].removeprefix("block"))
-    new_key = key
-
-    old_prefix = f"blocks.block{block_index}"
-    new_prefix = f"transformer_blocks.{block_index}"
-    new_key = new_prefix + new_key.removeprefix(old_prefix)
-
-    state_dict[new_key] = state_dict.pop(key)
-
-
-TRANSFORMER_KEYS_RENAME_DICT = {
-    "t_embedder.1": "time_embed.t_embedder",
-    "affline_norm": "time_embed.norm",
-    ".blocks.0.block.attn": ".attn1",
-    ".blocks.1.block.attn": ".attn2",
-    ".blocks.2.block": ".ff",
-    ".blocks.0.adaLN_modulation.1": ".norm1.linear_1",
-    ".blocks.0.adaLN_modulation.2": ".norm1.linear_2",
-    ".blocks.1.adaLN_modulation.1": ".norm2.linear_1",
-    ".blocks.1.adaLN_modulation.2": ".norm2.linear_2",
-    ".blocks.2.adaLN_modulation.1": ".norm3.linear_1",
-    ".blocks.2.adaLN_modulation.2": ".norm3.linear_2",
-    "to_q.0": "to_q",
-    "to_q.1": "norm_q",
-    "to_k.0": "to_k",
-    "to_k.1": "norm_k",
-    "to_v.0": "to_v",
-    "layer1": "net.0.proj",
-    "layer2": "net.2",
-    "proj.1": "proj",
-    "x_embedder": "patch_embed",
-    "extra_pos_embedder": "learnable_pos_embed",
-    "final_layer.adaLN_modulation.1": "norm_out.linear_1",
-    "final_layer.adaLN_modulation.2": "norm_out.linear_2",
-    "final_layer.linear": "proj_out",
-}
-
-TRANSFORMER_SPECIAL_KEYS_REMAP = {
-    "blocks.block": rename_transformer_blocks_,
-    "logvar.0.freqs": remove_keys_,
-    "logvar.0.phases": remove_keys_,
-    "logvar.1.weight": remove_keys_,
-    "pos_embedder.seq": remove_keys_,
-}
-
-TRANSFORMER_CONFIGS = {
-    "Cosmos-1.0-Diffusion-7B-Text2World": {
-        "in_channels": 16,
-        "out_channels": 16,
-        "num_attention_heads": 32,
-        "attention_head_dim": 128,
-        "num_layers": 28,
-        "mlp_ratio": 4.0,
-        "text_embed_dim": 1024,
-        "adaln_lora_dim": 256,
-        "max_size": (128, 240, 240),
-        "patch_size": (1, 2, 2),
-        "rope_scale": (2.0, 1.0, 1.0),
-        "concat_padding_mask": True,
-        "extra_pos_embed_type": "learnable",
-    },
-    "Cosmos-1.0-Diffusion-7B-Video2World": {
-        "in_channels": 16 + 1,
-        "out_channels": 16,
-        "num_attention_heads": 32,
-        "attention_head_dim": 128,
-        "num_layers": 28,
-        "mlp_ratio": 4.0,
-        "text_embed_dim": 1024,
-        "adaln_lora_dim": 256,
-        "max_size": (128, 240, 240),
-        "patch_size": (1, 2, 2),
-        "rope_scale": (2.0, 1.0, 1.0),
-        "concat_padding_mask": True,
-        "extra_pos_embed_type": "learnable",
-    },
-    "Cosmos-1.0-Diffusion-14B-Text2World": {
-        "in_channels": 16,
-        "out_channels": 16,
-        "num_attention_heads": 40,
-        "attention_head_dim": 128,
-        "num_layers": 36,
-        "mlp_ratio": 4.0,
-        "text_embed_dim": 1024,
-        "adaln_lora_dim": 256,
-        "max_size": (128, 240, 240),
-        "patch_size": (1, 2, 2),
-        "rope_scale": (2.0, 2.0, 2.0),
-        "concat_padding_mask": True,
-        "extra_pos_embed_type": "learnable",
-    },
-    "Cosmos-1.0-Diffusion-14B-Video2World": {
-        "in_channels": 16 + 1,
-        "out_channels": 16,
-        "num_attention_heads": 40,
-        "attention_head_dim": 128,
-        "num_layers": 36,
-        "mlp_ratio": 4.0,
-        "text_embed_dim": 1024,
-        "adaln_lora_dim": 256,
-        "max_size": (128, 240, 240),
-        "patch_size": (1, 2, 2),
-        "rope_scale": (2.0, 2.0, 2.0),
-        "concat_padding_mask": True,
-        "extra_pos_embed_type": "learnable",
-    },
-}
-
-VAE_KEYS_RENAME_DICT = {
-    "down.0": "down_blocks.0",
-    "down.1": "down_blocks.1",
-    "down.2": "down_blocks.2",
-    "up.0": "up_blocks.2",
-    "up.1": "up_blocks.1",
-    "up.2": "up_blocks.0",
-    ".block.": ".resnets.",
-    "downsample": "downsamplers.0",
-    "upsample": "upsamplers.0",
-    "mid.block_1": "mid_block.resnets.0",
-    "mid.attn_1.0": "mid_block.attentions.0",
-    "mid.attn_1.1": "mid_block.temp_attentions.0",
-    "mid.block_2": "mid_block.resnets.1",
-    ".q.conv3d": ".to_q",
-    ".k.conv3d": ".to_k",
-    ".v.conv3d": ".to_v",
-    ".proj_out.conv3d": ".to_out.0",
-    ".0.conv3d": ".conv_s",
-    ".1.conv3d": ".conv_t",
-    "conv1.conv3d": "conv1",
-    "conv2.conv3d": "conv2",
-    "conv3.conv3d": "conv3",
-    "nin_shortcut.conv3d": "conv_shortcut",
-    "quant_conv.conv3d": "quant_conv",
-    "post_quant_conv.conv3d": "post_quant_conv",
-}
-
-VAE_SPECIAL_KEYS_REMAP = {
-    "wavelets": remove_keys_,
-    "_arange": remove_keys_,
-    "patch_size_buffer": remove_keys_,
-}
-
-VAE_CONFIGS = {
-    "CV8x8x8-0.1": {
-        "name": "nvidia/Cosmos-0.1-Tokenizer-CV8x8x8",
-        "diffusers_config": {
-            "in_channels": 3,
-            "out_channels": 3,
-            "latent_channels": 16,
-            "encoder_block_out_channels": (128, 256, 512, 512),
-            "decode_block_out_channels": (256, 512, 512, 512),
-            "attention_resolutions": (32,),
-            "resolution": 1024,
-            "num_layers": 2,
-            "patch_size": 4,
-            "patch_type": "haar",
-            "scaling_factor": 1.0,
-            "spatial_compression_ratio": 8,
-            "temporal_compression_ratio": 8,
-            "latents_mean": None,
-            "latents_std": None,
-        },
-    },
-    "CV8x8x8-1.0": {
-        "name": "nvidia/Cosmos-1.0-Tokenizer-CV8x8x8",
-        "diffusers_config": {
-            "in_channels": 3,
-            "out_channels": 3,
-            "latent_channels": 16,
-            "encoder_block_out_channels": (128, 256, 512, 512),
-            "decode_block_out_channels": (256, 512, 512, 512),
-            "attention_resolutions": (32,),
-            "resolution": 1024,
-            "num_layers": 2,
-            "patch_size": 4,
-            "patch_type": "haar",
-            "scaling_factor": 1.0,
-            "spatial_compression_ratio": 8,
-            "temporal_compression_ratio": 8,
-            "latents_mean": None,
-            "latents_std": None,
-        },
-    },
-}
-
-
-def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
-    state_dict = saved_dict
-    if "model" in saved_dict.keys():
-        state_dict = state_dict["model"]
-    if "module" in saved_dict.keys():
-        state_dict = state_dict["module"]
-    if "state_dict" in saved_dict.keys():
-        state_dict = state_dict["state_dict"]
-    return state_dict
-
-
-def convert_transformer(transformer_type: str, ckpt_path: str):
-    PREFIX_KEY = "net."
-    original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", weights_only=True))
-
-    with init_empty_weights():
-        config = TRANSFORMER_CONFIGS[transformer_type]
-        transformer = CosmosTransformer3DModel(**config)
-
-    for key in list(original_state_dict.keys()):
-        new_key = key[:]
-        if new_key.startswith(PREFIX_KEY):
-            new_key = new_key.removeprefix(PREFIX_KEY)
-        for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
-            new_key = new_key.replace(replace_key, rename_key)
-        update_state_dict_(original_state_dict, key, new_key)
-
-    for key in list(original_state_dict.keys()):
-        for special_key, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
-            if special_key not in key:
-                continue
-            handler_fn_inplace(key, original_state_dict)
-
-    transformer.load_state_dict(original_state_dict, strict=True, assign=True)
-    return transformer
-
-
-def convert_vae(vae_type: str):
-    model_name = VAE_CONFIGS[vae_type]["name"]
-    snapshot_directory = snapshot_download(model_name, repo_type="model")
-    directory = pathlib.Path(snapshot_directory)
-
-    autoencoder_file = directory / "autoencoder.jit"
-    mean_std_file = directory / "mean_std.pt"
-
-    original_state_dict = torch.jit.load(autoencoder_file.as_posix()).state_dict()
-    if mean_std_file.exists():
-        mean_std = torch.load(mean_std_file, map_location="cpu", weights_only=True)
-    else:
-        mean_std = (None, None)
-
-    config = VAE_CONFIGS[vae_type]["diffusers_config"]
-    config.update(
-        {
-            "latents_mean": mean_std[0].detach().cpu().numpy().tolist(),
-            "latents_std": mean_std[1].detach().cpu().numpy().tolist(),
-        }
-    )
-    vae = AutoencoderKLCosmos(**config)
-
-    for key in list(original_state_dict.keys()):
-        new_key = key[:]
-        for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
-            new_key = new_key.replace(replace_key, rename_key)
-        update_state_dict_(original_state_dict, key, new_key)
-
-    for key in list(original_state_dict.keys()):
-        for special_key, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items():
-            if special_key not in key:
-                continue
-            handler_fn_inplace(key, original_state_dict)
-
-    vae.load_state_dict(original_state_dict, strict=True, assign=True)
-    return vae
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--transformer_type", type=str, default=None, choices=list(TRANSFORMER_CONFIGS.keys()))
-    parser.add_argument(
-        "--transformer_ckpt_path", type=str, default=None, help="Path to original transformer checkpoint"
-    )
-    parser.add_argument("--vae_type", type=str, default=None, choices=list(VAE_CONFIGS.keys()), help="Type of VAE")
-    parser.add_argument("--text_encoder_path", type=str, default="google-t5/t5-11b")
-    parser.add_argument("--tokenizer_path", type=str, default="google-t5/t5-11b")
-    parser.add_argument("--save_pipeline", action="store_true")
-    parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
-    parser.add_argument("--dtype", default="bf16", help="Torch dtype to save the transformer in.")
-    return parser.parse_args()
-
-
-DTYPE_MAPPING = {
-    "fp32": torch.float32,
-    "fp16": torch.float16,
-    "bf16": torch.bfloat16,
-}
-
-
-if __name__ == "__main__":
-    args = get_args()
-
-    transformer = None
-    dtype = DTYPE_MAPPING[args.dtype]
-
-    if args.save_pipeline:
-        assert args.transformer_ckpt_path is not None
-        assert args.vae_type is not None
-        assert args.text_encoder_path is not None
-        assert args.tokenizer_path is not None
-
-    if args.transformer_ckpt_path is not None:
-        transformer = convert_transformer(args.transformer_type, args.transformer_ckpt_path)
-        transformer = transformer.to(dtype=dtype)
-        if not args.save_pipeline:
-            transformer.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
-
-    if args.vae_type is not None:
-        vae = convert_vae(args.vae_type)
-        if not args.save_pipeline:
-            vae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
-
-    if args.save_pipeline:
-        text_encoder = T5EncoderModel.from_pretrained(args.text_encoder_path, torch_dtype=dtype)
-        tokenizer = T5TokenizerFast.from_pretrained(args.tokenizer_path)
-        # The original code initializes EDM config with sigma_min=0.0002, but does not make use of it anywhere directly.
-        # So, the sigma_min values that is used is the default value of 0.002.
-        scheduler = EDMEulerScheduler(
-            sigma_min=0.002,
-            sigma_max=80,
-            sigma_data=0.5,
-            sigma_schedule="karras",
-            num_train_timesteps=1000,
-            prediction_type="epsilon",
-            rho=7.0,
-            final_sigmas_type="sigma_min",
-        )
-
-        pipe = CosmosTextToWorldPipeline(
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            transformer=transformer,
-            vae=vae,
-            scheduler=scheduler,
-        )
-        pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
@@ -220,7 +220,7 @@ def convert_flux_transformer_checkpoint_to_diffusers(
            f"double_blocks.{i}.txt_attn.proj.bias"
        )

-    # single transformer blocks
+    # single transformer blocks
    for i in range(num_single_layers):
        block_prefix = f"single_transformer_blocks.{i}."
        # norm.linear  <- single_blocks.0.modulation.lin
@@ -7,15 +7,7 @@ from accelerate import init_empty_weights
 from safetensors.torch import load_file
 from transformers import T5EncoderModel, T5Tokenizer

-from diffusers import (
-    AutoencoderKLLTXVideo,
-    FlowMatchEulerDiscreteScheduler,
-    LTXConditionPipeline,
-    LTXLatentUpsamplePipeline,
-    LTXPipeline,
-    LTXVideoTransformer3DModel,
-)
-from diffusers.pipelines.ltx.modeling_latent_upsampler import LTXLatentUpsamplerModel
+from diffusers import AutoencoderKLLTXVideo, FlowMatchEulerDiscreteScheduler, LTXPipeline, LTXVideoTransformer3DModel


 def remove_keys_(key: str, state_dict: Dict[str, Any]):
@@ -131,10 +123,17 @@ def update_state_dict_inplace(state_dict: Dict[str, Any], old_key: str, new_key:
    state_dict[new_key] = state_dict.pop(old_key)


-def convert_transformer(ckpt_path: str, config, dtype: torch.dtype):
+def convert_transformer(
+    ckpt_path: str,
+    dtype: torch.dtype,
+    version: str = "0.9.0",
+):
    PREFIX_KEY = "model.diffusion_model."

    original_state_dict = get_state_dict(load_file(ckpt_path))
+    config = {}
+    if version == "0.9.5":
+        config["_use_causal_rope_fix"] = True
    with init_empty_weights():
        transformer = LTXVideoTransformer3DModel(**config)

@@ -181,59 +180,8 @@ def convert_vae(ckpt_path: str, config, dtype: torch.dtype):
    return vae


-def convert_spatial_latent_upsampler(ckpt_path: str, config, dtype: torch.dtype):
-    original_state_dict = get_state_dict(load_file(ckpt_path))
-
-    with init_empty_weights():
-        latent_upsampler = LTXLatentUpsamplerModel(**config)
-
-    latent_upsampler.load_state_dict(original_state_dict, strict=True, assign=True)
-    latent_upsampler.to(dtype)
-    return latent_upsampler
-
-
-def get_transformer_config(version: str) -> Dict[str, Any]:
-    if version == "0.9.7":
-        config = {
-            "in_channels": 128,
-            "out_channels": 128,
-            "patch_size": 1,
-            "patch_size_t": 1,
-            "num_attention_heads": 32,
-            "attention_head_dim": 128,
-            "cross_attention_dim": 4096,
-            "num_layers": 48,
-            "activation_fn": "gelu-approximate",
-            "qk_norm": "rms_norm_across_heads",
-            "norm_elementwise_affine": False,
-            "norm_eps": 1e-6,
-            "caption_channels": 4096,
-            "attention_bias": True,
-            "attention_out_bias": True,
-        }
-    else:
-        config = {
-            "in_channels": 128,
-            "out_channels": 128,
-            "patch_size": 1,
-            "patch_size_t": 1,
-            "num_attention_heads": 32,
-            "attention_head_dim": 64,
-            "cross_attention_dim": 2048,
-            "num_layers": 28,
-            "activation_fn": "gelu-approximate",
-            "qk_norm": "rms_norm_across_heads",
-            "norm_elementwise_affine": False,
-            "norm_eps": 1e-6,
-            "caption_channels": 4096,
-            "attention_bias": True,
-            "attention_out_bias": True,
-        }
-    return config
-
-
 def get_vae_config(version: str) -> Dict[str, Any]:
-    if version in ["0.9.0"]:
+    if version == "0.9.0":
        config = {
            "in_channels": 3,
            "out_channels": 3,
@@ -262,7 +210,7 @@ def get_vae_config(version: str) -> Dict[str, Any]:
            "decoder_causal": False,
            "timestep_conditioning": False,
        }
-    elif version in ["0.9.1"]:
+    elif version == "0.9.1":
        config = {
            "in_channels": 3,
            "out_channels": 3,
@@ -292,7 +240,7 @@ def get_vae_config(version: str) -> Dict[str, Any]:
            "decoder_causal": False,
        }
        VAE_KEYS_RENAME_DICT.update(VAE_091_RENAME_DICT)
-    elif version in ["0.9.5"]:
+    elif version == "0.9.5":
        config = {
            "in_channels": 3,
            "out_channels": 3,
@@ -324,53 +272,6 @@ def get_vae_config(version: str) -> Dict[str, Any]:
            "temporal_compression_ratio": 8,
        }
        VAE_KEYS_RENAME_DICT.update(VAE_095_RENAME_DICT)
-    elif version in ["0.9.7"]:
-        config = {
-            "in_channels": 3,
-            "out_channels": 3,
-            "latent_channels": 128,
-            "block_out_channels": (128, 256, 512, 1024, 2048),
-            "down_block_types": (
-                "LTXVideo095DownBlock3D",
-                "LTXVideo095DownBlock3D",
-                "LTXVideo095DownBlock3D",
-                "LTXVideo095DownBlock3D",
-            ),
-            "decoder_block_out_channels": (256, 512, 1024),
-            "layers_per_block": (4, 6, 6, 2, 2),
-            "decoder_layers_per_block": (5, 5, 5, 5),
-            "spatio_temporal_scaling": (True, True, True, True),
-            "decoder_spatio_temporal_scaling": (True, True, True),
-            "decoder_inject_noise": (False, False, False, False),
-            "downsample_type": ("spatial", "temporal", "spatiotemporal", "spatiotemporal"),
-            "upsample_residual": (True, True, True),
-            "upsample_factor": (2, 2, 2),
-            "timestep_conditioning": True,
-            "patch_size": 4,
-            "patch_size_t": 1,
-            "resnet_norm_eps": 1e-6,
-            "scaling_factor": 1.0,
-            "encoder_causal": True,
-            "decoder_causal": False,
-            "spatial_compression_ratio": 32,
-            "temporal_compression_ratio": 8,
-        }
-        VAE_KEYS_RENAME_DICT.update(VAE_095_RENAME_DICT)
-    return config
-
-
-def get_spatial_latent_upsampler_config(version: str) -> Dict[str, Any]:
-    if version == "0.9.7":
-        config = {
-            "in_channels": 128,
-            "mid_channels": 512,
-            "num_blocks_per_stage": 4,
-            "dims": 3,
-            "spatial_upsample": True,
-            "temporal_upsample": False,
-        }
-    else:
-        raise ValueError(f"Unsupported version: {version}")
    return config


@@ -380,12 +281,6 @@ def get_args():
        "--transformer_ckpt_path", type=str, default=None, help="Path to original transformer checkpoint"
    )
    parser.add_argument("--vae_ckpt_path", type=str, default=None, help="Path to original vae checkpoint")
-    parser.add_argument(
-        "--spatial_latent_upsampler_path",
-        type=str,
-        default=None,
-        help="Path to original spatial latent upsampler checkpoint",
-    )
    parser.add_argument(
        "--text_encoder_cache_dir", type=str, default=None, help="Path to text encoder cache directory"
    )
@@ -399,11 +294,7 @@ def get_args():
    parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
    parser.add_argument("--dtype", default="fp32", help="Torch dtype to save the model in.")
    parser.add_argument(
-        "--version",
-        type=str,
-        default="0.9.0",
-        choices=["0.9.0", "0.9.1", "0.9.5", "0.9.7"],
-        help="Version of the LTX model",
+        "--version", type=str, default="0.9.0", choices=["0.9.0", "0.9.1", "0.9.5"], help="Version of the LTX model"
    )
    return parser.parse_args()

@@ -429,9 +320,11 @@ if __name__ == "__main__":
    variant = VARIANT_MAPPING[args.dtype]
    output_path = Path(args.output_path)

+    if args.save_pipeline:
+        assert args.transformer_ckpt_path is not None and args.vae_ckpt_path is not None
+
    if args.transformer_ckpt_path is not None:
-        config = get_transformer_config(args.version)
-        transformer: LTXVideoTransformer3DModel = convert_transformer(args.transformer_ckpt_path, config, dtype)
+        transformer: LTXVideoTransformer3DModel = convert_transformer(args.transformer_ckpt_path, dtype, args.version)
        if not args.save_pipeline:
            transformer.save_pretrained(
                output_path / "transformer", safe_serialization=True, max_shard_size="5GB", variant=variant
@@ -443,16 +336,6 @@ if __name__ == "__main__":
        if not args.save_pipeline:
            vae.save_pretrained(output_path / "vae", safe_serialization=True, max_shard_size="5GB", variant=variant)

-    if args.spatial_latent_upsampler_path is not None:
-        config = get_spatial_latent_upsampler_config(args.version)
-        latent_upsampler: LTXLatentUpsamplerModel = convert_spatial_latent_upsampler(
-            args.spatial_latent_upsampler_path, config, dtype
-        )
-        if not args.save_pipeline:
-            latent_upsampler.save_pretrained(
-                output_path / "latent_upsampler", safe_serialization=True, max_shard_size="5GB", variant=variant
-            )
-
    if args.save_pipeline:
        text_encoder_id = "google/t5-v1_1-xxl"
        tokenizer = T5Tokenizer.from_pretrained(text_encoder_id, model_max_length=TOKENIZER_MAX_LENGTH)
@@ -465,7 +348,7 @@ if __name__ == "__main__":
        for param in text_encoder.parameters():
            param.data = param.data.contiguous()

-        if args.version in ["0.9.5", "0.9.7"]:
+        if args.version == "0.9.5":
            scheduler = FlowMatchEulerDiscreteScheduler(use_dynamic_shifting=False)
        else:
            scheduler = FlowMatchEulerDiscreteScheduler(
@@ -477,40 +360,12 @@ if __name__ == "__main__":
                shift_terminal=0.1,
            )

-        if args.version in ["0.9.0", "0.9.1", "0.9.5"]:
-            pipe = LTXPipeline(
-                scheduler=scheduler,
-                vae=vae,
-                text_encoder=text_encoder,
-                tokenizer=tokenizer,
-                transformer=transformer,
-            )
-            pipe.save_pretrained(
-                output_path.as_posix(), safe_serialization=True, variant=variant, max_shard_size="5GB"
-            )
-        elif args.version in ["0.9.7"]:
-            pipe = LTXConditionPipeline(
-                scheduler=scheduler,
-                vae=vae,
-                text_encoder=text_encoder,
-                tokenizer=tokenizer,
-                transformer=transformer,
-            )
-            pipe_upsample = LTXLatentUpsamplePipeline(
-                vae=vae,
-                latent_upsampler=latent_upsampler,
-            )
-            pipe.save_pretrained(
-                (output_path / "ltx_pipeline").as_posix(),
-                safe_serialization=True,
-                variant=variant,
-                max_shard_size="5GB",
-            )
-            pipe_upsample.save_pretrained(
-                (output_path / "ltx_upsample_pipeline").as_posix(),
-                safe_serialization=True,
-                variant=variant,
-                max_shard_size="5GB",
-            )
-        else:
-            raise ValueError(f"Unsupported version: {args.version}")
+        pipe = LTXPipeline(
+            scheduler=scheduler,
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            transformer=transformer,
+        )
+
+        pipe.save_pretrained(args.output_path, safe_serialization=True, variant=variant, max_shard_size="5GB")
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
yiyixuxu	d34c4e8caf	update the description of StableDiffusionXLDenoiseLoopWrapper	2025-06-20 07:38:21 +02:00
yiyixuxu	b46b7c8b31	add to method to modular loader, copied from DiffusionPipeline, not tested yet	2025-06-20 07:25:20 +02:00
yiyixuxu	fc9168f429	add block mappings to modular_diffusers.stable_diffusion_xl.__init__	2025-06-20 07:24:14 +02:00
yiyixuxu	31a31ca1c5	rename modular_pipeline_block_mappings.py to modular_block_mapping	2025-06-20 07:23:14 +02:00
yiyixuxu	8423652b35	updatee modular_pipeline.from_pretrained, modular_repo ->pretrained_model_name_or_path	2025-06-19 05:30:18 +02:00
yiyixuxu	de631947cc	up	2025-06-19 04:45:20 +02:00
yiyixuxu	58e9565719	update doc format for kwargs_type	2025-06-19 02:24:51 +02:00
yiyixuxu	cb6d5fed19	refator based on dhruv's feedbacks	2025-06-18 10:11:22 +02:00
yiyixuxu	f16e9c7807	add	2025-06-10 23:10:17 +02:00
yiyixuxu	87f63d424a	modular node!	2025-05-22 11:50:36 +02:00
yiyixuxu	29de29f02c	add node_utils	2025-05-21 22:31:10 +02:00
yiyixuxu	72e1b74638	solve merge conflict: manually add back the remote code change to modular_pipeline	2025-05-20 20:26:51 +02:00
yiyixuxu	3471f2fb75	merge part1	2025-05-20 18:53:04 +02:00
yiyixuxu	d136ae36c8	update input for loop blocks, do not need to include intermediate	2025-05-20 18:11:05 +02:00
yiyixuxu	1b89ac144c	prepare_latents_img2img pipeline method -> function, maybe do the same for others?	2025-05-20 18:10:06 +02:00
yiyixuxu	eb9415031a	add a to-do for modular loader	2025-05-20 18:08:28 +02:00
yiyixuxu	de6ab6b49d	fix import in block mapping	2025-05-20 18:07:58 +02:00
yiyixuxu	4968edc5dc	remove the duplicated components_manager file I forgot to deletee	2025-05-20 18:07:27 +02:00
Dhruv Nair	808dff09cb	[WIP] Modular Diffusers support custom code/pipeline blocks (#11539 ) * update * update	2025-05-20 15:12:51 +05:30
yiyixuxu	61dac3bbe4	up	2025-05-19 22:39:32 +02:00
yiyixuxu	73ab5725c2	update components manager	2025-05-18 19:09:01 +02:00
yiyixuxu	163341d3dd	refactor modular loader: 1. load only load (pretrained components only if not specific names) 2. update acceept create spec 3. move the updte _componeent_spec logic outside register_components to each method that create/update the component: __init__/update/load	2025-05-18 18:58:26 +02:00
yiyixuxu	d0fbf745e6	refactor component spec: replace create/create_from_pretrained/create_from_config to just create and load method	2025-05-18 18:52:12 +02:00
yiyixuxu	27c1158b23	add a to-do for guider cconfig mixin	2025-05-18 18:50:03 +02:00
yiyixuxu	96ce6744fe	after_denoise -> decoders	2025-05-15 00:45:45 +02:00
yiyixuxu	8ad14a52cb	make generator intermediates (it is mutable)	2025-05-13 23:25:56 +02:00
yiyixuxu	a7fb2d2a22	remove the output step	2025-05-13 22:15:54 +02:00
yiyixuxu	a0deefb606	fix more	2025-05-13 20:51:21 +02:00
yiyixuxu	e2491af650	fix import	2025-05-13 20:42:57 +02:00
yiyixuxu	506a8ea09c	fix imports	2025-05-13 04:36:06 +02:00
yiyixuxu	58358c2d00	decode block, if skip decoding do not need to update latent	2025-05-13 01:57:47 +02:00
yiyixuxu	5cde77f915	make inputs truly immutable, remove the output logic in sequential pipeline, and update so that intermediates_outputs are only new variables	2025-05-13 01:52:51 +02:00
yiyixuxu	522e827625	move block mappings to its own file	2025-05-12 01:17:45 +02:00
yiyixuxu	144eae4e0b	add block state will also make sure modifed intermediates_inputs will be updated	2025-05-12 01:16:42 +02:00
yiyixuxu	796453cad1	add notes	2025-05-12 01:14:43 +02:00
yiyixuxu	153ae34ff6	update __init__	2025-05-10 03:50:47 +02:00
yiyixuxu	0acb5e1460	made a modular_pipelines folder!	2025-05-10 03:50:31 +02:00
yiyixuxu	462429b687	remove modular reelated change from pipelines folder	2025-05-10 03:50:10 +02:00
yiyixuxu	cf01aaeb49	update imports on guiders	2025-05-10 03:49:30 +02:00
yiyixuxu	2017ae5624	fix auto denoise so all tests pass	2025-05-09 08:19:24 +02:00
yiyixuxu	2b361a2413	fix get_execusion blocks with loopsequential	2025-05-09 08:17:10 +02:00
yiyixuxu	c677d528e4	change warning to debug	2025-05-09 08:16:24 +02:00
yiyixuxu	0f0618ff2b	refactor the denoiseestep using LoopSequential! also add a new file for denoise step	2025-05-08 11:28:52 +02:00
yiyixuxu	d89631fc50	update input formating, consider kwarggs_type inputs with no name, e/g *_controlnet_kwargs	2025-05-08 11:27:17 +02:00
yiyixuxu	16b6583fa8	allow input_fields as input & update message	2025-05-08 11:25:31 +02:00
yiyixuxu	f552773572	remove controlnet union denoise step, refactor & reuse controlnet denoisee step to accept aditional contrlnet kwargs	2025-05-06 10:00:14 +02:00
yiyixuxu	dc4dbfe107	reefactor pipeline/block states so that it can dynamically accept kwargs	2025-05-06 09:58:44 +02:00
yiyixuxu	43ac1ff7e7	refactor controlnet union	2025-05-04 22:17:25 +02:00
yiyixuxu	efd70b7838	seperate controlnet step into input + denoise	2025-05-03 20:22:05 +02:00
yiyixuxu	7ca860c24b	rename pipeline -> components, data -> block_state	2025-05-03 01:32:59 +02:00
yiyixuxu	7b86fcea31	remove lora step and ip-adapter step -> no longer needed	2025-05-02 11:31:25 +02:00
yiyixuxu	c8b5d56412	make loader optional	2025-05-02 00:46:31 +02:00
YiYi Xu	ce642e92da	Merge branch 'modular-diffusers' into modular-refactor	2025-04-30 17:56:51 -10:00
YiYi Xu	6a509ba862	Merge branch 'main' into modular-diffusers	2025-04-30 17:56:25 -10:00
YiYi Xu	6d5beefe29	[modular diffusers] introducing ModularLoader (#11462 ) * cfg; slg; pag; sdxl without controlnet --------- Co-authored-by: Aryan <aryan@huggingface.co>	2025-04-30 11:17:20 -10:00
Aryan	b863bdd6ca	Modular Diffusers Guiders (#11311 ) * cfg; slg; pag; sdxl without controlnet * support sdxl controlnet * support controlnet union * update * update * cfg zero* * use unwrap_module for torch compiled modules * remove guider kwargs * remove commented code * remove old guider * fix slg bug * remove debug print * autoguidance * smoothed energy guidance * add note about seg * tangential cfg * cfg plus plus * support cfgpp in ddim * apply review suggestions * refactor * rename enable/disable * remove cfg++ for now * rename do_classifier_free_guidance->prepare_unconditional_embeds * remove unused	2025-04-26 03:42:42 +05:30
yiyixuxu	d143851309	move methods to blocks	2025-04-12 11:46:25 +02:00
yiyixuxu	9ad1470d48	up	2025-04-11 18:29:21 +02:00
yiyixuxu	bf99ab2f55	up	2025-04-09 20:36:45 +02:00
yiyixuxu	ee842839ef	add componentspec and configspec	2025-04-09 01:40:02 +02:00
YiYi Xu	96795afc72	Merge branch 'main' into modular-diffusers	2025-04-07 18:05:00 -10:00
yiyixuxu	12650e1393	up	2025-02-04 02:08:28 +01:00
yiyixuxu	addaad013c	more more more refactor	2025-02-03 20:36:05 +01:00
yiyixuxu	485f8d1758	more refactor	2025-02-01 21:30:05 +01:00
yiyixuxu	cff0fd6260	more refactor	2025-02-01 11:36:13 +01:00
yiyixuxu	8ddb20bfb8	up	2025-02-01 05:45:00 +01:00
yiyixuxu	e5089d702b	update	2025-01-31 21:55:45 +01:00
yiyixuxu	2c3e4eafa8	fix	2025-01-29 17:58:40 +01:00
yiyixuxu	c7020df2cf	add model_info	2025-01-27 11:33:27 +01:00
yiyixuxu	4bed3e306e	up up	2025-01-26 13:04:33 +01:00
yiyixuxu	00a3bc9d6c	fix	2025-01-23 18:16:00 +01:00
YiYi Xu	ccb35acd81	Merge branch 'main' into modular-diffusers	2025-01-23 07:07:11 -10:00
yiyixuxu	00cae4e857	docstring doc doc doc	2025-01-23 11:07:13 +01:00
yiyixuxu	b3fb4188f5	Merge branch 'modular-diffusers' of github.com:huggingface/diffusers into modular-diffusers	2025-01-22 17:24:06 +01:00
YiYi Xu	71df1581f7	Update src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_modular.py Co-authored-by: Álvaro Somoza <asomoza@users.noreply.github.com>	2025-01-22 06:19:22 -10:00
yiyixuxu	d046cf7d35	block state + fix for num_images_per_prompt > 1 for denoise/controlnet union etc	2025-01-22 09:48:57 +01:00
yiyixuxu	68a5185c86	refactor more, ipadapter node, lora node	2025-01-20 03:36:01 +01:00
yiyixuxu	6e2fe26bfd	fix more for lora	2025-01-18 08:04:12 +01:00
yiyixuxu	77b5fa59c5	make it work with lora has both text_encoder & unet	2025-01-18 04:12:07 +01:00
yiyixuxu	a226920b52	get_block_state make it less verbose	2025-01-17 01:37:18 +01:00
yiyixuxu	7007f72409	InputParam, OutputParam, get_auto_doc	2025-01-16 11:44:24 +01:00
yiyixuxu	a6804de4a2	add controlnet union to auto & fix for pag	2025-01-12 16:24:01 +01:00
yiyixuxu	7f897a9fc4	fix	2025-01-12 04:50:45 +01:00
yiyixuxu	0966663d2a	adjust print	2025-01-11 19:15:54 +01:00
yiyixuxu	fb78f4f12d	Merge branch 'modular-diffusers' of github.com:huggingface/diffusers into modular-diffusers	2025-01-11 09:05:56 +01:00
yiyixuxu	2220af6940	refactor	2025-01-11 09:05:47 +01:00
hlky	7a34832d52	[modular] Stable Diffusion XL ControlNet Union (#10509 ) StableDiffusionXLControlNetUnionDenoiseStep	2025-01-09 10:29:45 -10:00
yiyixuxu	e973de64f9	fix contro;net inpaint preprocess	2025-01-08 21:47:20 +01:00
yiyixuxu	db94ca882d	add controlnet inpaint + more refactor	2025-01-07 20:49:58 +01:00
yiyixuxu	6985906a2e	controlnet input & remove the MultiPipelineBlocks class	2025-01-07 01:56:33 +01:00
yiyixuxu	54f410db6c	add inpaint	2025-01-06 09:19:59 +01:00
yiyixuxu	c12a05b9c1	update to to not assume pipeline has hf_device_map	2025-01-03 20:57:44 +01:00
yiyixuxu	2e0f5c86cc	start to add inpaint	2025-01-03 18:20:39 +01:00
yiyixuxu	1d63306295	make it work with lora	2025-01-03 06:07:25 +01:00
yiyixuxu	6c93626f6f	remove run_blocks, just use __call__	2025-01-02 00:59:12 +01:00
yiyixuxu	72c5bf07c8	add a from_block class method to modular pipeline	2025-01-02 00:49:34 +01:00
yiyixuxu	ed59f90f15	modular pipeline builder -> ModularPipeline	2025-01-01 22:15:48 +01:00
yiyixuxu	a09ca7f27e	refactors: block __init__ no longer accept args. remove update_states from pipeline blocks, add update_states to modularpipeline, remove multi-block support for modular pipeline, remove offload support on modular pipeline	2025-01-01 21:43:20 +01:00
yiyixuxu	8c02572e16	add memory_reserve_margin arg to auto offload	2024-12-31 20:08:53 +01:00
yiyixuxu	27dde51de8	add output arg to run_blocks	2024-12-31 18:06:44 +01:00
yiyixuxu	10d4a775f1	style	2024-12-31 09:55:50 +01:00
yiyixuxu	72d9a81d99	components manager	2024-12-31 09:54:46 +01:00
yiyixuxu	4fa85c7963	add model_manager and global offloading method	2024-12-31 02:57:42 +01:00
YiYi Xu	806e8e66fb	Merge branch 'main' into modular-diffusers	2024-12-29 00:44:43 -10:00
yiyixuxu	0b90051db8	add vae encoder node	2024-12-19 17:57:12 +01:00
yiyixuxu	b305c779b2	add offload support!	2024-12-14 21:37:21 +01:00
yiyixuxu	2b3cd2d39c	update	2024-12-14 03:02:31 +01:00
yiyixuxu	bc3d1c9ee6	add model_cpu_offload_seq + _exlude_from_cpu_offload	2024-12-14 00:24:15 +01:00
yiyixuxu	e50d614636	only add model as expected_component when the model need to run for the block, currently it's added even when only config is needed	2024-12-11 03:39:39 +01:00
hlky	a8df0f1ffb	Modular APG (#10173 )	2024-12-10 08:22:42 -10:00
yiyixuxu	ace53e2d2f	update/refactor	2024-12-10 03:41:28 +01:00
yiyixuxu	ffc2992fc2	add autostep (not complete)	2024-11-16 22:42:06 +01:00
yiyixuxu	c70a285c2c	style	2024-10-30 10:33:25 +01:00
yiyixuxu	8b811feece	refactor, from_pretrained, from_pipe, remove_blocks, replace_blocks	2024-10-30 10:13:03 +01:00
yiyixuxu	37e8dc7a59	remove img2img blocksgit status consolidate text2img and img2img	2024-10-28 00:37:48 +01:00
yiyixuxu	024a9f5de3	fix so that run_blocks can work with inputs in the state	2024-10-27 18:52:56 +01:00
yiyixuxu	005195c23e	add	2024-10-27 15:18:10 +01:00
yiyixuxu	6742f160df	up	2024-10-27 14:59:31 +01:00
yiyixuxu	540d303250	refactor guider	2024-10-26 21:17:06 +02:00
yiyixuxu	f1b3036ca1	update pag guider - draft	2024-10-24 00:14:59 +02:00
yiyixuxu	46ec1743a2	refactor guider, remove prepareguidance step to be combinedd into denoisestep	2024-10-23 21:42:40 +02:00
yiyixuxu	70272b1108	combine controlnetstep into contronetdesnoisestep	2024-10-20 19:45:00 +02:00
yiyixuxu	2b6dcbfa1d	fix controlnet	2024-10-20 19:23:37 +02:00
yiyixuxu	af9572d759	controlnet	2024-10-19 12:36:12 +02:00
yiyixuxu	ddea157979	add from_pipe + run_blocks	2024-10-17 20:02:36 +02:00
yiyixuxu	ad3f9a26c0	update img2img, result match	2024-10-17 05:47:15 +02:00
yiyixuxu	e8d0980f9f	add img2img support - output does not match with non-modular pipeline completely yet (look into later)	2024-10-16 20:56:39 +02:00
yiyixuxu	52a7f1cb97	add dataflow info for each block in builder _repr_	2024-10-16 09:04:32 +02:00
yiyixuxu	33f85fadf6	add	2024-10-14 19:16:23 +02:00