resolve conflicts

Revert "merge main"
This reverts commit 65efbcead5.
2025-08-28 15:12:03 +05:30 · 2025-08-28 15:02:28 +05:30 · 2025-08-28 14:56:46 +05:30 · 2025-08-26 10:50:02 +05:30 · 2025-08-26 10:37:30 +05:30 · 2025-08-26 09:39:50 +05:30
244 changed files with 5310 additions and 16603 deletions
@@ -340,9 +340,6 @@ jobs:
          - backend: "optimum_quanto"
            test_location: "quanto"
            additional_deps: []
-          - backend: "nvidia_modelopt"
-            test_location: "modelopt"
-            additional_deps: []
    runs-on:
      group: aws-g6e-xlarge-plus
    container:
@@ -110,9 +110,8 @@ jobs:
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
-        # Stopping this update temporarily until the Hub RC is fully shipped and integrated.
-        # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
-        # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
+        pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
+        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps

    - name: Environment
      run: |
@@ -116,9 +116,8 @@ jobs:
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
-        # Stopping this update temporarily until the Hub RC is fully shipped and integrated.
-        # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
-        # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
+        pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
+        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps

    - name: Environment
      run: |
@@ -254,10 +253,9 @@ jobs:
        python -m uv pip install -e [quality,test]
        # TODO (sayakpaul, DN6): revisit `--no-deps`
        python -m pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
-        # Stopping this update temporarily until the Hub RC is fully shipped and integrated.
-        # python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
-        # python -m uv pip install -U tokenizers
-        # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
+        python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
+        python -m uv pip install -U tokenizers
+        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps

    - name: Environment
      run: |
@@ -132,9 +132,8 @@ jobs:
        run: |
          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
          python -m uv pip install -e [quality,test]
-          # Stopping this update temporarily until the Hub RC is fully shipped and integrated.
-          # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
-          # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
+          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+          pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps

      - name: Environment
        run: |
@@ -204,9 +203,8 @@ jobs:
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
        python -m uv pip install peft@git+https://github.com/huggingface/peft.git
-        # Stopping this update temporarily until the Hub RC is fully shipped and integrated.
-        # pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
-        # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
+        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
+        pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps

    - name: Environment
      run: |
@@ -268,8 +266,7 @@ jobs:
    - name: Install dependencies
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        # Stopping this update temporarily until the Hub RC is fully shipped and integrated.
-        # pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
+        pip uninstall transformers -y && python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
        python -m uv pip install -e [quality,test,training]

    - name: Environment
@@ -23,11 +23,13 @@
  - local: using-diffusers/reusing_seeds
    title: Reproducibility
  - local: using-diffusers/schedulers
-    title: Schedulers
+    title: Load schedulers and models
+  - local: using-diffusers/scheduler_features
+    title: Scheduler features
  - local: using-diffusers/other-formats
    title: Model files and layouts
  - local: using-diffusers/push_to_hub
-    title: Sharing pipelines and models
+    title: Push files to the Hub

 - title: Adapters
  isExpanded: false
@@ -56,6 +58,12 @@
    title: Batch inference
  - local: training/distributed_inference
    title: Distributed inference
+  - local: using-diffusers/scheduler_features
+    title: Scheduler features
+  - local: using-diffusers/callback
+    title: Pipeline callbacks
+  - local: using-diffusers/image_quality
+    title: Controlling image quality

 - title: Inference optimization
  isExpanded: false
@@ -64,14 +72,10 @@
    title: Accelerate inference
  - local: optimization/cache
    title: Caching
-  - local: optimization/attention_backends
-    title: Attention backends
  - local: optimization/memory
    title: Reduce memory usage
  - local: optimization/speed-memory-optims
    title: Compiling and offloading quantized models
-  - local: api/parallel
-    title: Parallel inference
  - title: Community optimizations
    sections:
    - local: optimization/pruna
@@ -82,16 +86,12 @@
      title: Token merging
    - local: optimization/deepcache
      title: DeepCache
-    - local: optimization/cache_dit
-      title: CacheDiT
    - local: optimization/tgate
      title: TGATE
    - local: optimization/xdit
      title: xDiT
    - local: optimization/para_attn
      title: ParaAttention
-    - local: using-diffusers/image_quality
-      title: FreeU

 - title: Hybrid Inference
  isExpanded: false
@@ -188,8 +188,6 @@
    title: torchao
  - local: quantization/quanto
    title: quanto
-  - local: quantization/modelopt
-    title: NVIDIA ModelOpt

 - title: Model accelerators and hardware
  isExpanded: false
@@ -20,12 +20,6 @@ All pipelines with [`VaeImageProcessor`] accept PIL Image, PyTorch tensor, or Nu

 [[autodoc]] image_processor.VaeImageProcessor

-## InpaintProcessor
-
-The [`InpaintProcessor`] accepts `mask` and `image` inputs and processes them together. Optionally, it can accept `padding_mask_crop` and apply a mask overlay.
-
-[[autodoc]] image_processor.InpaintProcessor
-
 ## VaeImageProcessorLDM3D

 The [`VaeImageProcessorLDM3D`] accepts RGB and depth inputs and returns RGB and depth outputs.
@@ -42,4 +42,4 @@ pipe = FluxControlNetPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", co

 ## FluxControlNetOutput

-[[autodoc]] models.controlnets.controlnet_flux.FluxControlNetOutput
+[[autodoc]] models.controlnet_flux.FluxControlNetOutput
@@ -43,4 +43,4 @@ controlnet = SparseControlNetModel.from_pretrained("guoyww/animatediff-sparsectr

 ## SparseControlNetOutput

-[[autodoc]] models.controlnets.controlnet_sparsectrl.SparseControlNetOutput
+[[autodoc]] models.controlnet_sparsectrl.SparseControlNetOutput
@@ -1,24 +0,0 @@
-<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# Parallelism
-
-Parallelism strategies help speed up diffusion transformers by distributing computations across multiple devices, allowing for faster inference/training times.
-
-## ParallelConfig
-
-[[autodoc]] ParallelConfig
-
-## ContextParallelConfig
-
-[[autodoc]] ContextParallelConfig
-
-[[autodoc]] hooks.apply_context_parallel
@@ -50,7 +50,7 @@ from diffusers.utils import export_to_video
 pipeline_quant_config = PipelineQuantizationConfig(
  quant_backend="torchao",
  quant_kwargs={"quant_type": "int8wo"},
-  components_to_quantize="transformer"
+  components_to_quantize=["transformer"]
 )

 # fp8 layerwise weight-casting
@@ -54,7 +54,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
      "bnb_4bit_quant_type": "nf4",
      "bnb_4bit_compute_dtype": torch.bfloat16
      },
-    components_to_quantize="transformer"
+    components_to_quantize=["transformer"]
 )

 pipeline = HunyuanVideoPipeline.from_pretrained(
@@ -91,7 +91,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
      "bnb_4bit_quant_type": "nf4",
      "bnb_4bit_compute_dtype": torch.bfloat16
      },
-    components_to_quantize="transformer"
+    components_to_quantize=["transformer"]
 )

 pipeline = HunyuanVideoPipeline.from_pretrained(
@@ -139,7 +139,7 @@ export_to_video(video, "output.mp4", fps=15)
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_compute_dtype": torch.bfloat16
        },
-      components_to_quantize="transformer"
+      components_to_quantize=["transformer"]
  )

  pipeline = HunyuanVideoPipeline.from_pretrained(
@@ -26,7 +26,6 @@ Qwen-Image comes in the following variants:
 |:----------:|:--------:|
 | Qwen-Image | [`Qwen/Qwen-Image`](https://huggingface.co/Qwen/Qwen-Image) |
 | Qwen-Image-Edit | [`Qwen/Qwen-Image-Edit`](https://huggingface.co/Qwen/Qwen-Image-Edit) |
-| Qwen-Image-Edit Plus | [Qwen/Qwen-Image-Edit-2509](https://huggingface.co/Qwen/Qwen-Image-Edit-2509) |

 <Tip>

@@ -97,29 +96,6 @@ The `guidance_scale` parameter in the pipeline is there to support future guidan

 </Tip>

-## Multi-image reference with QwenImageEditPlusPipeline
-
-With [`QwenImageEditPlusPipeline`], one can provide multiple images as input reference.
-
-```
-import torch
-from PIL import Image
-from diffusers import QwenImageEditPlusPipeline
-from diffusers.utils import load_image
-
-pipe = QwenImageEditPlusPipeline.from_pretrained(
-    "Qwen/Qwen-Image-Edit-2509", torch_dtype=torch.bfloat16
-).to("cuda")
-
-image_1 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/grumpy.jpg")
-image_2 = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peng.png")
-image = pipe(
-    image=[image_1, image_2], 
-    prompt='put the penguin and the cat at a game show called "Qwen Edit Plus Games"',
-    num_inference_steps=50
-).images[0]
-```
-
 ## QwenImagePipeline

 [[autodoc]] QwenImagePipeline
@@ -144,21 +120,7 @@ image = pipe(
  - all
  - __call__

-## QwenImageEditInpaintPipeline
-
-[[autodoc]] QwenImageEditInpaintPipeline
-  - all
-  - __call__
-
-## QwenImageControlNetPipeline
-
-[[autodoc]] QwenImageControlNetPipeline
-  - all
-  - __call__
-
-## QwenImageEditPlusPipeline
-
-[[autodoc]] QwenImageEditPlusPipeline
+## QwenImageControlNetPipeline
  - all
  - __call__

@@ -51,10 +51,10 @@ t2i_pipeline = t2i_blocks.init_pipeline(modular_repo_id, components_manager=comp
 </hfoption>
 </hfoptions>

-Components are only loaded and registered when using [`~ModularPipeline.load_components`] or [`~ModularPipeline.load_components`]. The example below uses [`~ModularPipeline.load_components`] to create a second pipeline that reuses all the components from the first one, and assigns it to a different collection
+Components are only loaded and registered when using [`~ModularPipeline.load_components`] or [`~ModularPipeline.load_default_components`]. The example below uses [`~ModularPipeline.load_default_components`] to create a second pipeline that reuses all the components from the first one, and assigns it to a different collection.

 ```py
-pipe.load_components()
+pipe.load_default_components()
 pipe2 = ModularPipeline.from_pretrained("YiYiXu/modular-demo-auto", components_manager=comp, collection="test2")
 ```

@@ -187,4 +187,4 @@ comp.enable_auto_cpu_offload(device="cuda")

 All models begin on the CPU and [`ComponentsManager`] moves them to the appropriate device right before they're needed, and moves other models back to the CPU when GPU memory is low.

-You can set your own rules for which models to offload first.
+You can set your own rules for which models to offload first.
@@ -75,13 +75,13 @@ Guiders that are already saved on the Hub with a `modular_model_index.json` file
 }
 ```

-The guider is only created after calling [`~ModularPipeline.load_components`] based on the loading specification in `modular_model_index.json`.
+The guider is only created after calling [`~ModularPipeline.load_default_components`] based on the loading specification in `modular_model_index.json`.

 ```py
 t2i_pipeline = t2i_blocks.init_pipeline("YiYiXu/modular-doc-guider")
 # not created during init
 assert t2i_pipeline.guider is None
-t2i_pipeline.load_components()
+t2i_pipeline.load_default_components()
 # loaded as PAG guider
 t2i_pipeline.guider
 ```
@@ -172,4 +172,4 @@ t2i_pipeline.push_to_hub("YiYiXu/modular-doc-guider")
 ```

 </hfoption>
-</hfoptions>
+</hfoptions>
@@ -29,7 +29,7 @@ blocks = SequentialPipelineBlocks.from_blocks_dict(TEXT2IMAGE_BLOCKS)
 modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
 pipeline = blocks.init_pipeline(modular_repo_id)

-pipeline.load_components(torch_dtype=torch.float16)
+pipeline.load_default_components(torch_dtype=torch.float16)
 pipeline.to("cuda")

 image = pipeline(prompt="Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", output="images")[0]
@@ -49,7 +49,7 @@ blocks = SequentialPipelineBlocks.from_blocks_dict(IMAGE2IMAGE_BLOCKS)
 modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
 pipeline = blocks.init_pipeline(modular_repo_id)

-pipeline.load_components(torch_dtype=torch.float16)
+pipeline.load_default_components(torch_dtype=torch.float16)
 pipeline.to("cuda")

 url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png"
@@ -73,7 +73,7 @@ blocks = SequentialPipelineBlocks.from_blocks_dict(INPAINT_BLOCKS)
 modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
 pipeline = blocks.init_pipeline(modular_repo_id)

-pipeline.load_components(torch_dtype=torch.float16)
+pipeline.load_default_components(torch_dtype=torch.float16)
 pipeline.to("cuda")

 img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png"
@@ -176,15 +176,15 @@ diffdiff_pipeline = ModularPipeline.from_pretrained(modular_repo_id, trust_remot

 ## Loading components

-A [`ModularPipeline`] doesn't automatically instantiate with components. It only loads the configuration and component specifications. You can load all components with [`~ModularPipeline.load_components`] or only load specific components with [`~ModularPipeline.load_components`].
+A [`ModularPipeline`] doesn't automatically instantiate with components. It only loads the configuration and component specifications. You can load all components with [`~ModularPipeline.load_default_components`] or only load specific components with [`~ModularPipeline.load_components`].

 <hfoptions id="load">
-<hfoption id="load_components">
+<hfoption id="load_default_components">

 ```py
 import torch

-t2i_pipeline.load_components(torch_dtype=torch.float16)
+t2i_pipeline.load_default_components(torch_dtype=torch.float16)
 t2i_pipeline.to("cuda")
 ```

@@ -355,4 +355,4 @@ The [config.json](https://huggingface.co/YiYiXu/modular-diffdiff-0704/blob/main/
    "ModularPipelineBlocks": "block.DiffDiffBlocks"
  }
 }
-```
+```
@@ -173,9 +173,9 @@ print(dd_blocks)

 ## ModularPipeline

-Convert the [`SequentialPipelineBlocks`] into a [`ModularPipeline`] with the [`ModularPipeline.init_pipeline`] method. This initializes the expected components to load from a `modular_model_index.json` file. Explicitly load the components by calling [`ModularPipeline.load_components`].
+Convert the [`SequentialPipelineBlocks`] into a [`ModularPipeline`] with the [`ModularPipeline.init_pipeline`] method. This initializes the expected components to load from a `modular_model_index.json` file. Explicitly load the components by calling [`ModularPipeline.load_default_components`].

-It is a good idea to initialize the [`ComponentManager`] with the pipeline to help manage the different components. Once you call [`~ModularPipeline.load_components`], the components are registered to the [`ComponentManager`] and can be shared between workflows. The example below uses the `collection` argument to assign the components a `"diffdiff"` label for better organization.
+It is a good idea to initialize the [`ComponentManager`] with the pipeline to help manage the different components. Once you call [`~ModularPipeline.load_default_components`], the components are registered to the [`ComponentManager`] and can be shared between workflows. The example below uses the `collection` argument to assign the components a `"diffdiff"` label for better organization.

 ```py
 from diffusers.modular_pipelines import ComponentsManager
@@ -209,11 +209,11 @@ Use the [`sub_blocks.insert`] method to insert it into the [`ModularPipeline`].
 dd_blocks.sub_blocks.insert("ip_adapter", ip_adapter_block, 0)
 ```

-Call [`~ModularPipeline.init_pipeline`] to initialize a [`ModularPipeline`] and use [`~ModularPipeline.load_components`] to load the model components. Load and set the IP-Adapter to run the pipeline.
+Call [`~ModularPipeline.init_pipeline`] to initialize a [`ModularPipeline`] and use [`~ModularPipeline.load_default_components`] to load the model components. Load and set the IP-Adapter to run the pipeline.

 ```py
 dd_pipeline = dd_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
-dd_pipeline.load_components(torch_dtype=torch.float16)
+dd_pipeline.load_default_components(torch_dtype=torch.float16)
 dd_pipeline.loader.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
 dd_pipeline.loader.set_ip_adapter_scale(0.6)
 dd_pipeline = dd_pipeline.to(device)
@@ -260,14 +260,14 @@ class SDXLDiffDiffControlNetDenoiseStep(StableDiffusionXLDenoiseLoopWrapper):
 controlnet_denoise_block = SDXLDiffDiffControlNetDenoiseStep()
 ```

-Insert the `controlnet_input` block and replace the `denoise` block with the new `controlnet_denoise_block`. Initialize a [`ModularPipeline`] and [`~ModularPipeline.load_components`] into it.
+Insert the `controlnet_input` block and replace the `denoise` block with the new `controlnet_denoise_block`. Initialize a [`ModularPipeline`] and [`~ModularPipeline.load_default_components`] into it.

 ```py
 dd_blocks.sub_blocks.insert("controlnet_input", control_input_block, 7)
 dd_blocks.sub_blocks["denoise"] = controlnet_denoise_block

 dd_pipeline = dd_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
-dd_pipeline.load_components(torch_dtype=torch.float16)
+dd_pipeline.load_default_components(torch_dtype=torch.float16)
 dd_pipeline = dd_pipeline.to(device)

 control_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/diffdiff_tomato_canny.jpeg")
@@ -320,7 +320,7 @@ Call [`SequentialPipelineBlocks.from_blocks_dict`] to create a [`SequentialPipel
 ```py
 dd_auto_blocks = SequentialPipelineBlocks.from_blocks_dict(DIFFDIFF_AUTO_BLOCKS)
 dd_pipeline = dd_auto_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
-dd_pipeline.load_components(torch_dtype=torch.float16)
+dd_pipeline.load_default_components(torch_dtype=torch.float16)
 ```

 ## Share
@@ -340,5 +340,5 @@ from diffusers.modular_pipelines import ModularPipeline, ComponentsManager
 components = ComponentsManager()

 diffdiff_pipeline = ModularPipeline.from_pretrained("YiYiXu/modular-diffdiff-0704", trust_remote_code=True, components_manager=components, collection="diffdiff")
-diffdiff_pipeline.load_components(torch_dtype=torch.float16)
-```
+diffdiff_pipeline.load_default_components(torch_dtype=torch.float16)
+```
@@ -1,114 +0,0 @@
-<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# Attention backends
-
-> [!NOTE]
-> The attention dispatcher is an experimental feature. Please open an issue if you have any feedback or encounter any problems.
-
-Diffusers provides several optimized attention algorithms that are more memory and computationally efficient through its *attention dispatcher*. The dispatcher acts as a router for managing and switching between different attention implementations and provides a unified interface for interacting with them.
-
-Refer to the table below for an overview of the available attention families and to the [Available backends](#available-backends) section for a more complete list.
-
-| attention family | main feature |
-|---|---|
-| FlashAttention | minimizes memory reads/writes through tiling and recomputation |
-| SageAttention | quantizes attention to int8 |
-| PyTorch native | built-in PyTorch implementation using [scaled_dot_product_attention](./fp16#scaled-dot-product-attention) |
-| xFormers | memory-efficient attention with support for various attention kernels |
-
-This guide will show you how to set and use the different attention backends.
-
-## set_attention_backend
-
-The [`~ModelMixin.set_attention_backend`] method iterates through all the modules in the model and sets the appropriate attention backend to use. The attention backend setting persists until [`~ModelMixin.reset_attention_backend`] is called.
-
-The example below demonstrates how to enable the `_flash_3_hub` implementation for FlashAttention-3 from the [kernel](https://github.com/huggingface/kernels) library, which allows you to instantly use optimized compute kernels from the Hub without requiring any setup.
-
-> [!NOTE]
-> FlashAttention-3 is not supported for non-Hopper architectures, in which case, use FlashAttention with `set_attention_backend("flash")`.
-
-```py
-import torch
-from diffusers import QwenImagePipeline
-
-pipeline = QwenImagePipeline.from_pretrained(
-    "Qwen/Qwen-Image", torch_dtype=torch.bfloat16, device_map="cuda"
-)
-pipeline.transformer.set_attention_backend("_flash_3_hub")
-
-prompt = """
-cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
-highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
-"""
-pipeline(prompt).images[0]
-```
-
-To restore the default attention backend, call [`~ModelMixin.reset_attention_backend`].
-
-```py
-pipeline.transformer.reset_attention_backend()
-```
-
-## attention_backend context manager
-
-The [attention_backend](https://github.com/huggingface/diffusers/blob/5e181eddfe7e44c1444a2511b0d8e21d177850a0/src/diffusers/models/attention_dispatch.py#L225) context manager temporarily sets an attention backend for a model within the context. Outside the context, the default attention (PyTorch's native scaled dot product attention) is used. This is useful if you want to use different backends for different parts of a pipeline or if you want to test the different backends.
-
-```py
-import torch
-from diffusers import QwenImagePipeline
-
-pipeline = QwenImagePipeline.from_pretrained(
-    "Qwen/Qwen-Image", torch_dtype=torch.bfloat16, device_map="cuda"
-)
-prompt = """
-cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
-highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
-"""
-
-with attention_backend("_flash_3_hub"):
-    image = pipeline(prompt).images[0]
-```
-
-> [!TIP]
-> Most attention backends support `torch.compile` without graph breaks and can be used to further speed up inference.
-
-## Available backends
-
-Refer to the table below for a complete list of available attention backends and their variants.
-
-<details>
-<summary>Expand</summary>
-
-| Backend Name | Family | Description |
-|--------------|--------|-------------|
-| `native` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | Default backend using PyTorch's scaled_dot_product_attention |
-| `flex` | [FlexAttention](https://docs.pytorch.org/docs/stable/nn.attention.flex_attention.html#module-torch.nn.attention.flex_attention) | PyTorch FlexAttention implementation |
-| `_native_cudnn` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | CuDNN-optimized attention |
-| `_native_efficient` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | Memory-efficient attention |
-| `_native_flash` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | PyTorch's FlashAttention |
-| `_native_math` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | Math-based attention (fallback) |
-| `_native_npu` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | NPU-optimized attention |
-| `_native_xla` | [PyTorch native](https://docs.pytorch.org/docs/stable/generated/torch.nn.attention.SDPBackend.html#torch.nn.attention.SDPBackend) | XLA-optimized attention |
-| `flash` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-2 |
-| `flash_varlen` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | Variable length FlashAttention |
-| `_flash_3` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-3 |
-| `_flash_varlen_3` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | Variable length FlashAttention-3 |
-| `_flash_3_hub` | [FlashAttention](https://github.com/Dao-AILab/flash-attention) | FlashAttention-3 from kernels |
-| `sage` | [SageAttention](https://github.com/thu-ml/SageAttention) | Quantized attention (INT8 QK) |
-| `sage_varlen` | [SageAttention](https://github.com/thu-ml/SageAttention) | Variable length SageAttention |
-| `_sage_qk_int8_pv_fp8_cuda` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP8 PV (CUDA) |
-| `_sage_qk_int8_pv_fp8_cuda_sm90` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP8 PV (SM90) |
-| `_sage_qk_int8_pv_fp16_cuda` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP16 PV (CUDA) |
-| `_sage_qk_int8_pv_fp16_triton` | [SageAttention](https://github.com/thu-ml/SageAttention) | INT8 QK + FP16 PV (Triton) |
-| `xformers` | [xFormers](https://github.com/facebookresearch/xformers) | Memory-efficient attention |
-
-</details>
@@ -1,270 +0,0 @@
-## CacheDiT  
-
-CacheDiT is a unified, flexible, and training-free cache acceleration framework designed to support nearly all Diffusers' DiT-based pipelines. It provides a unified cache API that supports automatic block adapter, DBCache, and more.
-
-To learn more, refer to the [CacheDiT](https://github.com/vipshop/cache-dit) repository.
-
-Install a stable release of CacheDiT from PyPI or you can install the latest version from GitHub.
-
-<hfoptions id="install">
-<hfoption id="PyPI">
-
-```bash
-pip3 install -U cache-dit
-```
-
-</hfoption>
-<hfoption id="source">
-
-```bash
-pip3 install git+https://github.com/vipshop/cache-dit.git
-```
-
-</hfoption>
-</hfoptions>
-
-Run the command below to view supported DiT pipelines.
-
-```python
->>> import cache_dit
->>> cache_dit.supported_pipelines()
-(30, ['Flux*', 'Mochi*', 'CogVideoX*', 'Wan*', 'HunyuanVideo*', 'QwenImage*', 'LTX*', 'Allegro*',
-'CogView3Plus*', 'CogView4*', 'Cosmos*', 'EasyAnimate*', 'SkyReelsV2*', 'StableDiffusion3*',
-'ConsisID*', 'DiT*', 'Amused*', 'Bria*', 'Lumina*', 'OmniGen*', 'PixArt*', 'Sana*', 'StableAudio*',
-'VisualCloze*', 'AuraFlow*', 'Chroma*', 'ShapE*', 'HiDream*', 'HunyuanDiT*', 'HunyuanDiTPAG*'])
-```
-
-For a complete benchmark, please refer to [Benchmarks](https://github.com/vipshop/cache-dit/blob/main/bench/).
-
-
-## Unified Cache API
-
-CacheDiT works by matching specific input/output patterns as shown below.
-
-![](https://github.com/vipshop/cache-dit/raw/main/assets/patterns-v1.png)
-
-Call the `enable_cache()` function on a pipeline to enable cache acceleration. This function is the entry point to many of CacheDiT's features.
-
-```python
-import cache_dit
-from diffusers import DiffusionPipeline 
-
-# Can be any diffusion pipeline
-pipe = DiffusionPipeline.from_pretrained("Qwen/Qwen-Image")
-
-# One-line code with default cache options.
-cache_dit.enable_cache(pipe) 
-
-# Just call the pipe as normal.
-output = pipe(...)
-
-# Disable cache and run original pipe.
-cache_dit.disable_cache(pipe)
-```
-
-## Automatic Block Adapter
-
-For custom or modified pipelines or transformers not included in Diffusers, use the `BlockAdapter` in `auto` mode or via manual configuration. Please check the [BlockAdapter](https://github.com/vipshop/cache-dit/blob/main/docs/User_Guide.md#automatic-block-adapter) docs for more details. Refer to [Qwen-Image w/ BlockAdapter](https://github.com/vipshop/cache-dit/blob/main/examples/adapter/run_qwen_image_adapter.py) as an example.
-
-
-```python
-from cache_dit import ForwardPattern, BlockAdapter
-
-# Use 🔥BlockAdapter with `auto` mode.
-cache_dit.enable_cache(
-    BlockAdapter(
-        # Any DiffusionPipeline, Qwen-Image, etc.  
-        pipe=pipe, auto=True,
-        # Check `📚Forward Pattern Matching` documentation and hack the code of
-        # Qwen-Image, you will find that it has satisfied `FORWARD_PATTERN_1`.
-        forward_pattern=ForwardPattern.Pattern_1,
-    ),   
-)
-
-# Or, manually setup transformer configurations.
-cache_dit.enable_cache(
-    BlockAdapter(
-        pipe=pipe, # Qwen-Image, etc.
-        transformer=pipe.transformer,
-        blocks=pipe.transformer.transformer_blocks,
-        forward_pattern=ForwardPattern.Pattern_1,
-    ), 
-)
-```
-
-Sometimes, a Transformer class contains more than one set of transformer `blocks`. For example, FLUX.1 (as well as HiDream, Chroma, etc.) contains `transformer_blocks` and `single_transformer_blocks` (with different forward patterns). The BlockAdapter is able to detect this hybrid pattern type as well.
-Refer to [FLUX.1](https://github.com/vipshop/cache-dit/blob/main/examples/adapter/run_flux_adapter.py) as an example.
-
-```python
-# For diffusers <= 0.34.0, FLUX.1 transformer_blocks and 
-# single_transformer_blocks have different forward patterns.
-cache_dit.enable_cache(
-    BlockAdapter(
-        pipe=pipe, # FLUX.1, etc.
-        transformer=pipe.transformer,
-        blocks=[
-            pipe.transformer.transformer_blocks,
-            pipe.transformer.single_transformer_blocks,
-        ],
-        forward_pattern=[
-            ForwardPattern.Pattern_1,
-            ForwardPattern.Pattern_3,
-        ],
-    ),
-)
-```
-
-This also works if there is more than one transformer (namely `transformer` and `transformer_2`) in its structure. Refer to [Wan 2.2 MoE](https://github.com/vipshop/cache-dit/blob/main/examples/pipeline/run_wan_2.2.py) as an example.
-
-## Patch Functor
-
-For any pattern not included in CacheDiT, use the Patch Functor to convert the pattern into a known pattern. You need to subclass the Patch Functor and may also need to fuse the operations within the blocks' `for` loop into the block's `forward`. After implementing a Patch Functor, set the `patch_functor` property in `BlockAdapter`.
-
-![](https://github.com/vipshop/cache-dit/raw/main/assets/patch-functor.png)
-
-Some Patch Functors are already provided in CacheDiT, [HiDreamPatchFunctor](https://github.com/vipshop/cache-dit/blob/main/src/cache_dit/cache_factory/patch_functors/functor_hidream.py), [ChromaPatchFunctor](https://github.com/vipshop/cache-dit/blob/main/src/cache_dit/cache_factory/patch_functors/functor_chroma.py), etc.
-
-```python
-@BlockAdapterRegistry.register("HiDream")
-def hidream_adapter(pipe, **kwargs) -> BlockAdapter:
-    from diffusers import HiDreamImageTransformer2DModel
-    from cache_dit.cache_factory.patch_functors import HiDreamPatchFunctor
-
-    assert isinstance(pipe.transformer, HiDreamImageTransformer2DModel)
-    return BlockAdapter(
-        pipe=pipe,
-        transformer=pipe.transformer,
-        blocks=[
-            pipe.transformer.double_stream_blocks,
-            pipe.transformer.single_stream_blocks,
-        ],
-        forward_pattern=[
-            ForwardPattern.Pattern_0,
-            ForwardPattern.Pattern_3,
-        ],
-        # NOTE: Setup your custom patch functor here.
-        patch_functor=HiDreamPatchFunctor(),
-        **kwargs,
-    )
-```
-
-Finally, you can call the `cache_dit.summary()` function on a pipeline after it has completed inference to get the cache acceleration details.
-
-```python
-stats = cache_dit.summary(pipe)
-```
-
-```python
-⚡️Cache Steps and Residual Diffs Statistics: QwenImagePipeline
-
-| Cache Steps | Diffs Min | Diffs P25 | Diffs P50 | Diffs P75 | Diffs P95 | Diffs Max |
-|-------------|-----------|-----------|-----------|-----------|-----------|-----------|
-| 23          | 0.045     | 0.084     | 0.114     | 0.147     | 0.241     | 0.297     |
-```
-
-## DBCache: Dual Block Cache  
-
-![](https://github.com/vipshop/cache-dit/raw/main/assets/dbcache-v1.png)
-
-DBCache (Dual Block Caching) supports different configurations of compute blocks (F8B12, etc.) to enable a balanced trade-off between performance and precision.
- Fn_compute_blocks: Specifies that DBCache uses the **first n** Transformer blocks to fit the information at time step t, enabling the calculation of a more stable L1 diff and delivering more accurate information to subsequent blocks.
- Bn_compute_blocks: Further fuses approximate information in the **last n** Transformer blocks to enhance prediction accuracy. These blocks act as an auto-scaler for approximate hidden states that use residual cache.
-
-
-```python
-import cache_dit
-from diffusers import FluxPipeline
-
-pipe_or_adapter = FluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    torch_dtype=torch.bfloat16,
-).to("cuda")
-
-# Default options, F8B0, 8 warmup steps, and unlimited cached 
-# steps for good balance between performance and precision
-cache_dit.enable_cache(pipe_or_adapter)
-
-# Custom options, F8B8, higher precision
-from cache_dit import BasicCacheConfig
-
-cache_dit.enable_cache(
-    pipe_or_adapter,
-    cache_config=BasicCacheConfig(
-        max_warmup_steps=8,  # steps do not cache
-        max_cached_steps=-1, # -1 means no limit
-        Fn_compute_blocks=8, # Fn, F8, etc.
-        Bn_compute_blocks=8, # Bn, B8, etc.
-        residual_diff_threshold=0.12,
-    ),
-)
-```  
-Check the [DBCache](https://github.com/vipshop/cache-dit/blob/main/docs/DBCache.md) and [User Guide](https://github.com/vipshop/cache-dit/blob/main/docs/User_Guide.md#dbcache) docs for more design details.
-
-## TaylorSeer Calibrator
-
-The [TaylorSeers](https://huggingface.co/papers/2503.06923) algorithm further improves the precision of DBCache in cases where the cached steps are large (Hybrid TaylorSeer + DBCache). At timesteps with significant intervals, the feature similarity in diffusion models decreases substantially, significantly harming the generation quality. 
-
-TaylorSeer employs a differential method to approximate the higher-order derivatives of features and predict features in future timesteps with Taylor series expansion. The TaylorSeer implemented in CacheDiT supports both hidden states and residual cache types. F_pred can be a residual cache or a hidden-state cache.
-
-```python
-from cache_dit import BasicCacheConfig, TaylorSeerCalibratorConfig
-
-cache_dit.enable_cache(
-    pipe_or_adapter,
-    # Basic DBCache w/ FnBn configurations
-    cache_config=BasicCacheConfig(
-        max_warmup_steps=8,  # steps do not cache
-        max_cached_steps=-1, # -1 means no limit
-        Fn_compute_blocks=8, # Fn, F8, etc.
-        Bn_compute_blocks=8, # Bn, B8, etc.
-        residual_diff_threshold=0.12,
-    ),
-    # Then, you can use the TaylorSeer Calibrator to approximate 
-    # the values in cached steps, taylorseer_order default is 1.
-    calibrator_config=TaylorSeerCalibratorConfig(
-        taylorseer_order=1,
-    ),
-)
-``` 
-
-> [!TIP]  
-> The `Bn_compute_blocks` parameter of DBCache can be set to `0` if you use TaylorSeer as the calibrator for approximate hidden states. DBCache's `Bn_compute_blocks` also acts as a calibrator, so you can choose either `Bn_compute_blocks` > 0 or TaylorSeer. We recommend using the configuration scheme of TaylorSeer + DBCache FnB0.
-
-## Hybrid Cache CFG
-
-CacheDiT supports caching for CFG (classifier-free guidance). For models that fuse CFG and non-CFG into a single forward step, or models that do not include CFG in the forward step, please set the `enable_separate_cfg` parameter to `False (default, None)`. Otherwise, set it to `True`.
-
-```python
-from cache_dit import BasicCacheConfig
-
-cache_dit.enable_cache(
-    pipe_or_adapter, 
-    cache_config=BasicCacheConfig(
-        ...,
-        # For example, set it as True for Wan 2.1, Qwen-Image 
-        # and set it as False for FLUX.1, HunyuanVideo, etc.
-        enable_separate_cfg=True,
-    ),
-)
-```
-
-## torch.compile
-
-CacheDiT is designed to work with torch.compile for even better performance. Call `torch.compile` after enabling the cache.
-
-
-```python
-cache_dit.enable_cache(pipe)
-
-# Compile the Transformer module
-pipe.transformer = torch.compile(pipe.transformer)
-```
-
-If you're using CacheDiT with dynamic input shapes, consider increasing the `recompile_limit` of `torch._dynamo`. Otherwise, the `recompile_limit` error may be triggered, causing the module to fall back to eager mode. 
-
-```python
-torch._dynamo.config.recompile_limit = 96  # default is 8
-torch._dynamo.config.accumulated_recompile_limit = 2048  # default is 256
-```
-
-Please check [perf.py](https://github.com/vipshop/cache-dit/blob/main/bench/perf.py) for more details.
@@ -291,53 +291,13 @@ Group offloading moves groups of internal layers ([torch.nn.ModuleList](https://
 > [!WARNING]
 > Group offloading may not work with all models if the forward implementation contains weight-dependent device casting of inputs because it may clash with group offloading's device casting mechanism.

-Enable group offloading by configuring the `offload_type` parameter to `block_level` or `leaf_level`.
+Call [`~ModelMixin.enable_group_offload`] to enable it for standard Diffusers model components that inherit from [`ModelMixin`]. For other model components that don't inherit from [`ModelMixin`], such as a generic [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), use [`~hooks.apply_group_offloading`] instead.
+
+The `offload_type` parameter can be set to `block_level` or `leaf_level`.

 - `block_level` offloads groups of layers based on the `num_blocks_per_group` parameter. For example, if `num_blocks_per_group=2` on a model with 40 layers, 2 layers are onloaded and offloaded at a time (20 total onloads/offloads). This drastically reduces memory requirements.
 - `leaf_level` offloads individual layers at the lowest level and is equivalent to [CPU offloading](#cpu-offloading). But it can be made faster if you use streams without giving up inference speed.

-Group offloading is supported for entire pipelines or individual models. Applying group offloading to the entire pipeline is the easiest option while selectively applying it to individual models gives users more flexibility to use different offloading techniques for different models.
-
-<hfoptions id="group-offloading">
-<hfoption id="pipeline">
-
-Call [`~DiffusionPipeline.enable_group_offload`] on a pipeline.
-
-```py
-import torch
-from diffusers import CogVideoXPipeline
-from diffusers.hooks import apply_group_offloading
-from diffusers.utils import export_to_video
-
-onload_device = torch.device("cuda")
-offload_device = torch.device("cpu")
-
-pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
-pipeline.enable_group_offload(
-    onload_device=onload_device,
-    offload_device=offload_device,
-    offload_type="leaf_level",
-    use_stream=True
-)
-
-prompt = (
-    "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
-    "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
-    "pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, "
-    "casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. "
-    "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
-    "atmosphere of this unique musical performance."
-)
-video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
-print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
-export_to_video(video, "output.mp4", fps=8)
-```
-
-</hfoption>
-<hfoption id="model">
-
-Call [`~ModelMixin.enable_group_offload`] on standard Diffusers model components that inherit from [`ModelMixin`]. For other model components that don't inherit from [`ModelMixin`], such as a generic [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), use [`~hooks.apply_group_offloading`] instead.
-
 ```py
 import torch
 from diffusers import CogVideoXPipeline
@@ -368,9 +328,6 @@ print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} G
 export_to_video(video, "output.mp4", fps=8)
 ```

-</hfoption>
-</hfoptions>
-
 #### CUDA stream

 The `use_stream` parameter can be activated for CUDA devices that support asynchronous data transfer streams to reduce overall execution time compared to [CPU offloading](#cpu-offloading). It overlaps data transfer and computation by using layer prefetching. The next layer to be executed is loaded onto the GPU while the current layer is still being executed. It can increase CPU memory significantly so ensure you have 2x the amount of memory as the model size.
@@ -1,141 +0,0 @@
-<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# NVIDIA ModelOpt
-
-[NVIDIA-ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a unified library of state-of-the-art model optimization techniques like quantization, pruning, distillation, speculative decoding, etc. It compresses deep learning models for downstream deployment frameworks like TensorRT-LLM or TensorRT to optimize inference speed.
-
-Before you begin, make sure you have nvidia_modelopt installed.
-
-```bash
-pip install -U "nvidia_modelopt[hf]"
-```
-
-Quantize a model by passing [`NVIDIAModelOptConfig`] to [`~ModelMixin.from_pretrained`] (you can also load pre-quantized models). This works for any model in any modality, as long as it supports loading with [Accelerate](https://hf.co/docs/accelerate/index) and contains `torch.nn.Linear` layers.
-
-The example below only quantizes the weights to FP8.
-
-```python
-import torch
-from diffusers import AutoModel, SanaPipeline, NVIDIAModelOptConfig
-
-model_id = "Efficient-Large-Model/Sana_600M_1024px_diffusers"
-dtype = torch.bfloat16
-
-quantization_config = NVIDIAModelOptConfig(quant_type="FP8", quant_method="modelopt")
-transformer = AutoModel.from_pretrained(
-    model_id,
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=dtype,
-)
-pipe = SanaPipeline.from_pretrained(
-    model_id,
-    transformer=transformer,
-    torch_dtype=dtype,
-)
-pipe.to("cuda")
-
-print(f"Pipeline memory usage: {torch.cuda.max_memory_reserved() / 1024**3:.3f} GB")
-
-prompt = "A cat holding a sign that says hello world"
-image = pipe(
-    prompt, num_inference_steps=50, guidance_scale=4.5, max_sequence_length=512
-).images[0]
-image.save("output.png")
-```
-
-> **Note:**
->
-> The quantization methods in NVIDIA-ModelOpt are designed to reduce the memory footprint of model weights using various QAT (Quantization-Aware Training) and PTQ (Post-Training Quantization) techniques while maintaining model performance. However, the actual performance gain during inference depends on the deployment framework (e.g., TRT-LLM, TensorRT) and the specific hardware configuration.  
-> 
-> More details can be found [here](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples).
-
-## NVIDIAModelOptConfig
-
-The `NVIDIAModelOptConfig` class accepts the following parameters:
- `quant_type`: A string value mentioning one of the quantization types below.
- `modules_to_not_convert`: A list of full/partial module names for which quantization should not be performed. For example, to not perform any quantization of the [`SD3Transformer2DModel`]'s pos_embed projection blocks, one would specify: `modules_to_not_convert=["pos_embed.proj.weight"]`.
- `disable_conv_quantization`: A boolean value which when set to `True` disables quantization for all convolutional layers in the model. This is useful as channel and block quantization generally don't work well with convolutional layers (used with INT4, NF4, NVFP4). If you want to disable quantization for specific convolutional layers, use `modules_to_not_convert` instead.
- `algorithm`: The algorithm to use for determining scale, defaults to `"max"`. You can check modelopt documentation for more algorithms and details.
- `forward_loop`: The forward loop function to use for calibrating activation during quantization. If not provided, it relies on static scale values computed using the weights only.
- `kwargs`: A dict of keyword arguments to pass to the underlying quantization method which will be invoked based on `quant_type`.
-
-## Supported quantization types
-
-ModelOpt supports weight-only, channel, and block quantization for int8, fp8, int4, nf4, and nvfp4. The quantization methods are designed to reduce the memory footprint of the model weights while maintaining the performance of the model during inference.
-
-Weight-only quantization stores the model weights in a specific low-bit data type but performs computation with a higher-precision data type, like `bfloat16`. This lowers the memory requirements from model weights but retains the memory peaks for activation computation.
-
-The quantization methods supported are as follows:
-
-| **Quantization Type** | **Supported Schemes** | **Required Kwargs** | **Additional Notes** |
-|-----------------------|-----------------------|---------------------|----------------------|
-| **INT8** | `int8 weight only`, `int8 channel quantization`, `int8 block quantization` | `quant_type`, `quant_type + channel_quantize`, `quant_type + channel_quantize + block_quantize` |
-| **FP8** | `fp8 weight only`, `fp8 channel quantization`, `fp8 block quantization` | `quant_type`, `quant_type + channel_quantize`, `quant_type + channel_quantize + block_quantize` |
-| **INT4** | `int4 weight only`, `int4 block quantization` | `quant_type`, `quant_type + channel_quantize + block_quantize` | `channel_quantize = -1 is only supported for now`|
-| **NF4** | `nf4 weight only`, `nf4 double block quantization` | `quant_type`, `quant_type + channel_quantize + block_quantize + scale_channel_quantize` + `scale_block_quantize` | `channel_quantize = -1 and scale_channel_quantize = -1 are only supported for now` |
-| **NVFP4** | `nvfp4 weight only`, `nvfp4 block quantization` | `quant_type`, `quant_type + channel_quantize + block_quantize` | `channel_quantize = -1 is only supported for now`|
-
-
-Refer to the [official modelopt documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/) for a better understanding of the available quantization methods and the exhaustive list of configuration options available.
-
-## Serializing and Deserializing quantized models
-
-To serialize a quantized model in a given dtype, first load the model with the desired quantization dtype and then save it using the [`~ModelMixin.save_pretrained`] method.
-
-```python
-import torch
-from diffusers import AutoModel, NVIDIAModelOptConfig
-from modelopt.torch.opt import enable_huggingface_checkpointing
-
-enable_huggingface_checkpointing()
-
-model_id = "Efficient-Large-Model/Sana_600M_1024px_diffusers"
-quant_config_fp8 = {"quant_type": "FP8", "quant_method": "modelopt"}
-quant_config_fp8 = NVIDIAModelOptConfig(**quant_config_fp8)
-model = AutoModel.from_pretrained(
-    model_id,
-    subfolder="transformer",
-    quantization_config=quant_config_fp8,
-    torch_dtype=torch.bfloat16,
-)
-model.save_pretrained('path/to/sana_fp8', safe_serialization=False)
-```
-
-To load a serialized quantized model, use the [`~ModelMixin.from_pretrained`] method.
-
-```python
-import torch
-from diffusers import AutoModel, NVIDIAModelOptConfig, SanaPipeline
-from modelopt.torch.opt import enable_huggingface_checkpointing
-
-enable_huggingface_checkpointing()
-
-quantization_config = NVIDIAModelOptConfig(quant_type="FP8", quant_method="modelopt")
-transformer = AutoModel.from_pretrained(
-    "path/to/sana_fp8",
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=torch.bfloat16,
-)
-pipe = SanaPipeline.from_pretrained(
-    "Efficient-Large-Model/Sana_600M_1024px_diffusers",
-    transformer=transformer,
-    torch_dtype=torch.bfloat16,
-)
-pipe.to("cuda")
-prompt = "A cat holding a sign that says hello world"
-image = pipe(
-    prompt, num_inference_steps=50, guidance_scale=4.5, max_sequence_length=512
-).images[0]
-image.save("output.png")
-```
@@ -34,9 +34,7 @@ Initialize [`~quantizers.PipelineQuantizationConfig`] with the following paramet
 > [!TIP]
 > These `quant_kwargs` arguments are different for each backend. Refer to the [Quantization API](../api/quantization) docs to view the arguments for each backend.

- `components_to_quantize` specifies which component(s) of the pipeline to quantize. Typically, you should quantize the most compute intensive components like the transformer. The text encoder is another component to consider quantizing if a pipeline has more than one such as [`FluxPipeline`]. The example below quantizes the T5 text encoder in [`FluxPipeline`] while keeping the CLIP model intact.
-
-   `components_to_quantize` accepts either a list for multiple models or a string for a single model.
+- `components_to_quantize` specifies which components of the pipeline to quantize. Typically, you should quantize the most compute intensive components like the transformer. The text encoder is another component to consider quantizing if a pipeline has more than one such as [`FluxPipeline`]. The example below quantizes the T5 text encoder in [`FluxPipeline`] while keeping the CLIP model intact.

 The example below loads the bitsandbytes backend with the following arguments from [`~quantizers.quantization_config.BitsAndBytesConfig`], `load_in_4bit`, `bnb_4bit_quant_type`, and `bnb_4bit_compute_dtype`.

@@ -64,7 +62,6 @@ pipe = DiffusionPipeline.from_pretrained(
 image = pipe("photo of a cute dog").images[0]
 ```

-
 ### Advanced quantization

 The `quant_mapping` argument provides more options for how to quantize each individual component in a pipeline, like combining different quantization backends.
@@ -11,96 +11,69 @@ specific language governing permissions and limitations under the License. -->

 # torchao

-[torchao](https://github.com/pytorch/ao) provides high-performance dtypes and optimizations based on quantization and sparsity for inference and training PyTorch models. It is supported for any model in any modality, as long as it supports loading with [Accelerate](https://hf.co/docs/accelerate/index) and contains `torch.nn.Linear` layers.
+[TorchAO](https://github.com/pytorch/ao) is an architecture optimization library for PyTorch. It provides high-performance dtypes, optimization techniques, and kernels for inference and training, featuring composability with native PyTorch features like [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html), FullyShardedDataParallel (FSDP), and more.

-Make sure Pytorch 2.5+ and torchao are installed with the command below.
+Before you begin, make sure you have PyTorch 2.5+ and TorchAO installed.

 ```bash
-uv pip install -U torch torchao
+pip install -U torch torchao
 ```

-Each quantization dtype is available as a separate instance of an [AOBaseConfig](https://docs.pytorch.org/ao/main/api_ref_quantization.html#inference-apis-for-quantize) class. This provides more flexible configuration options by exposing more available arguments.

-Pass the `AOBaseConfig` of a quantization dtype, like [Int4WeightOnlyConfig](https://docs.pytorch.org/ao/main/generated/torchao.quantization.Int4WeightOnlyConfig) to [`TorchAoConfig`] in [`~ModelMixin.from_pretrained`].
+Quantize a model by passing [`TorchAoConfig`] to [`~ModelMixin.from_pretrained`] (you can also load pre-quantized models). This works for any model in any modality, as long as it supports loading with [Accelerate](https://hf.co/docs/accelerate/index) and contains `torch.nn.Linear` layers.

-```py
-import torch
-from diffusers import DiffusionPipeline, PipelineQuantizationConfig, TorchAoConfig
-from torchao.quantization import Int8WeightOnlyConfig
-
-pipeline_quant_config = PipelineQuantizationConfig(
-    quant_mapping={"transformer": TorchAoConfig(Int8WeightOnlyConfig(group_size=128)))}
-)
-pipeline = DiffusionPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    quantzation_config=pipeline_quant_config,
-    torch_dtype=torch.bfloat16,
-    device_map="cuda"
-)
-```
-
-For simple use cases, you could also provide a string identifier in [`TorchAoConfig`] as shown below.
-
-```py
-import torch
-from diffusers import DiffusionPipeline, PipelineQuantizationConfig, TorchAoConfig
-
-pipeline_quant_config = PipelineQuantizationConfig(
-    quant_mapping={"transformer": TorchAoConfig("int8wo")}
-)
-pipeline = DiffusionPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    quantzation_config=pipeline_quant_config,
-    torch_dtype=torch.bfloat16,
-    device_map="cuda"
-)
-```
-
-## torch.compile
-
-torchao supports [torch.compile](../optimization/fp16#torchcompile) which can speed up inference with one line of code.
+The example below only quantizes the weights to int8.

 ```python
 import torch
-from diffusers import DiffusionPipeline, PipelineQuantizationConfig, TorchAoConfig
-from torchao.quantization import Int4WeightOnlyConfig
+from diffusers import FluxPipeline, AutoModel, TorchAoConfig

-pipeline_quant_config = PipelineQuantizationConfig(
-    quant_mapping={"transformer": TorchAoConfig(Int4WeightOnlyConfig(group_size=128)))}
-)
-pipeline = DiffusionPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    quantzation_config=pipeline_quant_config,
-    torch_dtype=torch.bfloat16,
-    device_map="cuda"
-)
+model_id = "black-forest-labs/FLUX.1-dev"
+dtype = torch.bfloat16

-pipeline.transformer.compile(transformer, mode="max-autotune", fullgraph=True)
+quantization_config = TorchAoConfig("int8wo")
+transformer = AutoModel.from_pretrained(
+    model_id,
+    subfolder="transformer",
+    quantization_config=quantization_config,
+    torch_dtype=dtype,
+)
+pipe = FluxPipeline.from_pretrained(
+    model_id,
+    transformer=transformer,
+    torch_dtype=dtype,
+)
+pipe.to("cuda")
+
+# Without quantization: ~31.447 GB
+# With quantization: ~20.40 GB
+print(f"Pipeline memory usage: {torch.cuda.max_memory_reserved() / 1024**3:.3f} GB")
+
+prompt = "A cat holding a sign that says hello world"
+image = pipe(
+    prompt, num_inference_steps=50, guidance_scale=4.5, max_sequence_length=512
+).images[0]
+image.save("output.png")
 ```

-Refer to this [table](https://github.com/huggingface/diffusers/pull/10009#issue-2688781450) for inference speed and memory usage benchmarks with Flux and CogVideoX. More benchmarks on various hardware are also available in the torchao [repository](https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks).
+TorchAO is fully compatible with [torch.compile](../optimization/fp16#torchcompile), setting it apart from other quantization methods. This makes it easy to speed up inference with just one line of code.
+
+```python
+# In the above code, add the following after initializing the transformer
+transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True)
+```
+
+For speed and memory benchmarks on Flux and CogVideoX, please refer to the table [here](https://github.com/huggingface/diffusers/pull/10009#issue-2688781450). You can also find torchao [benchmark](https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks) numbers for various hardware.

 > [!TIP]
 > The FP8 post-training quantization schemes in torchao are effective for GPUs with compute capability of at least 8.9 (RTX-4090, Hopper, etc.). FP8 often provides the best speed, memory, and quality trade-off when generating images and videos. We recommend combining FP8 and torch.compile if your GPU is compatible.

-## autoquant
+torchao also supports an automatic quantization API through [autoquant](https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md#autoquantization). Autoquantization determines the best quantization strategy applicable to a model by comparing the performance of each technique on chosen input types and shapes. Currently, this can be used directly on the underlying modeling components. Diffusers will also expose an autoquant configuration option in the future.

-torchao provides [autoquant](https://docs.pytorch.org/ao/stable/generated/torchao.quantization.autoquant.html#torchao.quantization.autoquant) an automatic quantization API. Autoquantization chooses the best quantization strategy by comparing the performance of each strategy on chosen input types and shapes. This is only supported in Diffusers for individual models at the moment.
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-from torchao.quantization import autoquant
-
-# Load the pipeline
-pipeline = DiffusionPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-schnell",
-    torch_dtype=torch.bfloat16,
-    device_map="cuda"
-)
-
-transformer = autoquant(pipeline.transformer)
-```
+The `TorchAoConfig` class accepts three parameters:
+- `quant_type`: A string value mentioning one of the quantization types below.
+- `modules_to_not_convert`: A list of full/partial module names for which quantization should not be performed. For example, to not perform any quantization of the [`FluxTransformer2DModel`]'s first block, one would specify: `modules_to_not_convert=["single_transformer_blocks.0"]`.
+- `kwargs`: A dict of keyword arguments to pass to the underlying quantization method which will be invoked based on `quant_type`.

 ## Supported quantization types

@@ -223,7 +223,7 @@ from diffusers.image_processor import VaeImageProcessor
 import torch 

 vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=torch.bfloat16).to("cuda")
-vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
+vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
 image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)

 with torch.no_grad():
@@ -12,56 +12,112 @@ specific language governing permissions and limitations under the License.

 # AutoPipeline

-[AutoPipeline](../api/models/auto_model) is a *task-and-model* pipeline that automatically selects the correct pipeline subclass based on the task. It handles the complexity of loading different pipeline subclasses without needing to know the specific pipeline subclass name.
+Diffusers provides many pipelines for basic tasks like generating images, videos, audio, and inpainting. On top of these, there are specialized pipelines for adapters and features like upscaling, super-resolution, and more. Different pipeline classes can even use the same checkpoint because they share the same pretrained model! With so many different pipelines, it can be overwhelming to know which pipeline class to use.

-This is unlike [`DiffusionPipeline`], a *model-only* pipeline that automatically selects the pipeline subclass based on the model.
+The [AutoPipeline](../api/pipelines/auto_pipeline) class is designed to simplify the variety of pipelines in Diffusers. It is a generic *task-first* pipeline that lets you focus on a task ([`AutoPipelineForText2Image`], [`AutoPipelineForImage2Image`], and [`AutoPipelineForInpainting`]) without needing to know the specific pipeline class. The [AutoPipeline](../api/pipelines/auto_pipeline) automatically detects the correct pipeline class to use.

-[`AutoPipelineForImage2Image`] returns a specific pipeline subclass, (for example, [`StableDiffusionXLImg2ImgPipeline`]), which can only be used for image-to-image tasks.
+For example, let's use the [dreamlike-art/dreamlike-photoreal-2.0](https://hf.co/dreamlike-art/dreamlike-photoreal-2.0) checkpoint.
+
+Under the hood, [AutoPipeline](../api/pipelines/auto_pipeline):
+
+1. Detects a `"stable-diffusion"` class from the [model_index.json](https://hf.co/dreamlike-art/dreamlike-photoreal-2.0/blob/main/model_index.json) file.
+2. Depending on the task you're interested in, it loads the [`StableDiffusionPipeline`], [`StableDiffusionImg2ImgPipeline`], or [`StableDiffusionInpaintPipeline`]. Any parameter (`strength`, `num_inference_steps`, etc.) you would pass to these specific pipelines can also be passed to the [AutoPipeline](../api/pipelines/auto_pipeline).
+
+<hfoptions id="autopipeline">
+<hfoption id="text-to-image">

 ```py
+from diffusers import AutoPipelineForText2Image
 import torch
-from diffusers import AutoPipelineForImage2Image

-pipeline = AutoPipelineForImage2Image.from_pretrained(
-  "RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.bfloat16, device_map="cuda",
-)
-print(pipeline)
-"StableDiffusionXLImg2ImgPipeline {
-  "_class_name": "StableDiffusionXLImg2ImgPipeline",
-  ...
-"
+pipe_txt2img = AutoPipelineForText2Image.from_pretrained(
+    "dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16, use_safetensors=True
+).to("cuda")
+
+prompt = "cinematic photo of Godzilla eating sushi with a cat in a izakaya, 35mm photograph, film, professional, 4k, highly detailed"
+generator = torch.Generator(device="cpu").manual_seed(37)
+image = pipe_txt2img(prompt, generator=generator).images[0]
+image
 ```

-Loading the same model with [`DiffusionPipeline`] returns the [`StableDiffusionXLPipeline`] subclass. It can be used for text-to-image, image-to-image, or inpainting tasks depending on the inputs.
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-text2img.png"/>
+</div>
+
+</hfoption>
+<hfoption id="image-to-image">

 ```py
+from diffusers import AutoPipelineForImage2Image
+from diffusers.utils import load_image
 import torch
-from diffusers import DiffusionPipeline

-pipeline = DiffusionPipeline.from_pretrained(
-  "RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.bfloat16, device_map="cuda",
-)
-print(pipeline)
-"StableDiffusionXLPipeline {
-  "_class_name": "StableDiffusionXLPipeline",
-  ...
-"
+pipe_img2img = AutoPipelineForImage2Image.from_pretrained(
+    "dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16, use_safetensors=True
+).to("cuda")
+
+init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-text2img.png")
+
+prompt = "cinematic photo of Godzilla eating burgers with a cat in a fast food restaurant, 35mm photograph, film, professional, 4k, highly detailed"
+generator = torch.Generator(device="cpu").manual_seed(53)
+image = pipe_img2img(prompt, image=init_image, generator=generator).images[0]
+image
 ```

-Check the [mappings](https://github.com/huggingface/diffusers/blob/130fd8df54f24ffb006d84787b598d8adc899f23/src/diffusers/pipelines/auto_pipeline.py#L114) to see whether a model is supported or not.
-
-Trying to load an unsupported model returns an error.
+Notice how the [dreamlike-art/dreamlike-photoreal-2.0](https://hf.co/dreamlike-art/dreamlike-photoreal-2.0) checkpoint is used for both text-to-image and image-to-image tasks? To save memory and avoid loading the checkpoint twice, use the [`~DiffusionPipeline.from_pipe`] method.

 ```py
+pipe_img2img = AutoPipelineForImage2Image.from_pipe(pipe_txt2img).to("cuda")
+image = pipe_img2img(prompt, image=init_image, generator=generator).images[0]
+image
+```
+
+You can learn more about the [`~DiffusionPipeline.from_pipe`] method in the [Reuse a pipeline](../using-diffusers/loading#reuse-a-pipeline) guide.
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-img2img.png"/>
+</div>
+
+</hfoption>
+<hfoption id="inpainting">
+
+```py
+from diffusers import AutoPipelineForInpainting
+from diffusers.utils import load_image
 import torch
+
+pipeline = AutoPipelineForInpainting.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True
+).to("cuda")
+
+init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-img2img.png")
+mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-mask.png")
+
+prompt = "cinematic photo of a owl, 35mm photograph, film, professional, 4k, highly detailed"
+generator = torch.Generator(device="cpu").manual_seed(38)
+image = pipeline(prompt, image=init_image, mask_image=mask_image, generator=generator, strength=0.4).images[0]
+image
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-inpaint.png"/>
+</div>
+
+</hfoption>
+</hfoptions>
+
+## Unsupported checkpoints
+
+The [AutoPipeline](../api/pipelines/auto_pipeline) supports [Stable Diffusion](../api/pipelines/stable_diffusion/overview), [Stable Diffusion XL](../api/pipelines/stable_diffusion/stable_diffusion_xl), [ControlNet](../api/pipelines/controlnet), [Kandinsky 2.1](../api/pipelines/kandinsky), [Kandinsky 2.2](../api/pipelines/kandinsky_v22), and [DeepFloyd IF](../api/pipelines/deepfloyd_if) checkpoints.
+
+If you try to load an unsupported checkpoint, you'll get an error.
+
+```py
 from diffusers import AutoPipelineForImage2Image
+import torch

 pipeline = AutoPipelineForImage2Image.from_pretrained(
-    "openai/shap-e-img2img", torch_dtype=torch.float16,
+    "openai/shap-e-img2img", torch_dtype=torch.float16, use_safetensors=True
 )
 "ValueError: AutoPipeline can't find a pipeline linked to ShapEImg2ImgPipeline for None"
 ```
-
-There are three types of [AutoPipeline](../api/models/auto_model) classes, [`AutoPipelineForText2Image`], [`AutoPipelineForImage2Image`] and [`AutoPipelineForInpainting`]. Each of these classes have a predefined mapping, linking a pipeline to their task-specific subclass.
-
-When [`~AutoPipelineForText2Image.from_pretrained`] is called, it extracts the class name from the `model_index.json` file and selects the appropriate pipeline subclass for the task based on the mapping.
@@ -10,7 +10,13 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# FreeU
+# Controlling image quality
+
+The components of a diffusion model, like the UNet and scheduler, can be optimized to improve the quality of generated images leading to better details. These techniques are especially useful if you don't have the resources to simply use a larger model for inference. You can enable these techniques during inference without any additional training.
+
+This guide will show you how to turn these techniques on in your pipeline and how to configure them to improve the quality of your generated images.
+
+## Details

 [FreeU](https://hf.co/papers/2309.11497) improves image details by rebalancing the UNet's backbone and skip connection weights. The skip connections can cause the model to overlook some of the backbone semantics which may lead to unnatural image details in the generated image. This technique does not require any additional training and can be applied on the fly during inference for tasks like image-to-image and text-to-video.

@@ -133,7 +139,7 @@ export_to_video(video_frames, "teddy_bear.mp4", fps=10)
 </hfoption>
 </hfoptions>

-Call the [`~pipelines.StableDiffusionMixin.disable_freeu`] method to disable FreeU.
+Call the [`pipelines.StableDiffusionMixin.disable_freeu`] method to disable FreeU.

 ```py
 pipeline.disable_freeu()
@@ -108,20 +108,23 @@ print(pipeline.transformer.dtype, pipeline.vae.dtype)

 The `device_map` argument determines individual model or pipeline placement on an accelerator like a GPU. It is especially helpful when there are multiple GPUs.

-A pipeline supports two options for `device_map`, `"cuda"` and `"balanced"`. Refer to the table below to compare the placement strategies.
+Diffusers currently provides three options for `device_map`: `"cuda"`, `"balanced"`, and `"auto"`. Refer to the table below to compare the three placement strategies.

 | parameter | description |
 |---|---|
-| `"cuda"` | places pipeline on a supported accelerator device like CUDA |
-| `"balanced"` | evenly distributes pipeline on all GPUs |
+| `"cuda"` | places model or pipeline on CUDA device |
+| `"balanced"` | evenly distributes model or pipeline on all GPUs |
+| `"auto"` | distributes model from fastest device first to slowest |

 Use the `max_memory` argument in [`~DiffusionPipeline.from_pretrained`] to allocate a maximum amount of memory to use on each device. By default, Diffusers uses the maximum amount available.

+<hfoptions id="device_map">
+<hfoption id="pipeline">
+
 ```py
 import torch
 from diffusers import DiffusionPipeline

-max_memory = {0: "16GB", 1: "16GB"}
 pipeline = DiffusionPipeline.from_pretrained(
  "Qwen/Qwen-Image", 
  torch_dtype=torch.bfloat16,
@@ -129,6 +132,26 @@ pipeline = DiffusionPipeline.from_pretrained(
 )
 ```

+</hfoption>
+<hfoption id="individual model">
+
+```py
+import torch
+from diffusers import AutoModel
+
+max_memory = {0: "16GB", 1: "16GB"}
+transformer = AutoModel.from_pretrained(
+    "Qwen/Qwen-Image", 
+    subfolder="transformer",
+    torch_dtype=torch.bfloat16,
+    device_map="cuda",
+    max_memory=max_memory
+)
+```
+
+</hfoption>
+</hfoptions>
+
 The `hf_device_map` attribute allows you to access and view the `device_map`.

 ```py
@@ -166,18 +189,22 @@ pipeline = DiffusionPipeline.from_pretrained(

 [`DiffusionPipeline`] is flexible and accommodates loading different models or schedulers. You can experiment with different schedulers to optimize for generation speed or quality, and you can replace models with more performant ones.

-The example below uses a more stable VAE version.
+The example below swaps the default scheduler to generate higher quality images and a more stable VAE version. Pass the `subfolder` argument in [`~HeunDiscreteScheduler.from_pretrained`] to load the scheduler from the correct subfolder.

 ```py
 import torch
-from diffusers import DiffusionPipeline, AutoModel
+from diffusers import DiffusionPipeline, HeunDiscreteScheduler, AutoModel

+scheduler = HeunDiscreteScheduler.from_pretrained(
+  "stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"
+)
 vae = AutoModel.from_pretrained(
  "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
 )

 pipeline = DiffusionPipeline.from_pretrained(
  "stabilityai/stable-diffusion-xl-base-1.0",
+  scheduler=scheduler,
  vae=vae,
  torch_dtype=torch.float16,
  device_map="cuda"
@@ -10,22 +10,19 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

+# Push files to the Hub
+
 [[open-in-colab]]

-# Sharing pipelines and models
-
-Share your pipeline or models and schedulers on the Hub with the [`~diffusers.utils.PushToHubMixin`] class. This class:
+🤗 Diffusers provides a [`~diffusers.utils.PushToHubMixin`] for uploading your model, scheduler, or pipeline to the Hub. It is an easy way to store your files on the Hub, and also allows you to share your work with others. Under the hood, the [`~diffusers.utils.PushToHubMixin`]:

 1. creates a repository on the Hub
 2. saves your model, scheduler, or pipeline files so they can be reloaded later
 3. uploads folder containing these files to the Hub

-This guide will show you how to upload your files to the Hub with the [`~diffusers.utils.PushToHubMixin`] class.
+This guide will show you how to use the [`~diffusers.utils.PushToHubMixin`] to upload your files to the Hub.

-Log in to your Hugging Face account with your access [token](https://huggingface.co/settings/tokens).
-
-<hfoptions id="login">
-<hfoption id="notebook">
+You'll need to log in to your Hub account with your access [token](https://huggingface.co/settings/tokens) first:

 ```py
 from huggingface_hub import notebook_login
@@ -33,19 +30,9 @@ from huggingface_hub import notebook_login
 notebook_login()
 ```

-</hfoption>
-<hfoption id="hf CLI">
-
-```bash
-hf auth login
-```
-
-</hfoption>
-</hfoptions>
-
 ## Models

-To push a model to the Hub, call [`~diffusers.utils.PushToHubMixin.push_to_hub`] and specify the repository id of the model.
+To push a model to the Hub, call [`~diffusers.utils.PushToHubMixin.push_to_hub`] and specify the repository id of the model to be stored on the Hub:

 ```py
 from diffusers import ControlNetModel
@@ -61,9 +48,15 @@ controlnet = ControlNetModel(
 controlnet.push_to_hub("my-controlnet-model")
 ```

-The [`~diffusers.utils.PushToHubMixin.push_to_hub`] method saves the model's `config.json` file and the weights are automatically saved as safetensors files.
+For models, you can also specify the [*variant*](loading#checkpoint-variants) of the weights to push to the Hub. For example, to push `fp16` weights:

-Load the model again with [`~DiffusionPipeline.from_pretrained`].
+```py
+controlnet.push_to_hub("my-controlnet-model", variant="fp16")
+```
+
+The [`~diffusers.utils.PushToHubMixin.push_to_hub`] function saves the model's `config.json` file and the weights are automatically saved in the `safetensors` format.
+
+Now you can reload the model from your repository on the Hub:

 ```py
 model = ControlNetModel.from_pretrained("your-namespace/my-controlnet-model")
@@ -71,7 +64,7 @@ model = ControlNetModel.from_pretrained("your-namespace/my-controlnet-model")

 ## Scheduler

-To push a scheduler to the Hub, call [`~diffusers.utils.PushToHubMixin.push_to_hub`] and specify the repository id of the scheduler.
+To push a scheduler to the Hub, call [`~diffusers.utils.PushToHubMixin.push_to_hub`] and specify the repository id of the scheduler to be stored on the Hub:

 ```py
 from diffusers import DDIMScheduler
@@ -88,7 +81,7 @@ scheduler.push_to_hub("my-controlnet-scheduler")

 The [`~diffusers.utils.PushToHubMixin.push_to_hub`] function saves the scheduler's `scheduler_config.json` file to the specified repository.

-Load the scheduler again with [`~SchedulerMixin.from_pretrained`].
+Now you can reload the scheduler from your repository on the Hub:

 ```py
 scheduler = DDIMScheduler.from_pretrained("your-namepsace/my-controlnet-scheduler")
@@ -96,7 +89,7 @@ scheduler = DDIMScheduler.from_pretrained("your-namepsace/my-controlnet-schedule

 ## Pipeline

-To push a pipeline to the Hub, initialize the pipeline components with your desired parameters.
+You can also push an entire pipeline with all its components to the Hub. For example, initialize the components of a [`StableDiffusionPipeline`] with the parameters you want:

 ```py
 from diffusers import (
@@ -150,7 +143,7 @@ text_encoder = CLIPTextModel(text_encoder_config)
 tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
 ```

-Pass all components to the pipeline and call [`~diffusers.utils.PushToHubMixin.push_to_hub`].
+Pass all of the components to the [`StableDiffusionPipeline`] and call [`~diffusers.utils.PushToHubMixin.push_to_hub`] to push the pipeline to the Hub:

 ```py
 components = {
@@ -167,7 +160,7 @@ pipeline = StableDiffusionPipeline(**components)
 pipeline.push_to_hub("my-pipeline")
 ```

-The [`~diffusers.utils.PushToHubMixin.push_to_hub`] method saves each component to a subfolder in the repository. Load the pipeline again with [`~DiffusionPipeline.from_pretrained`].
+The [`~diffusers.utils.PushToHubMixin.push_to_hub`] function saves each component to a subfolder in the repository. Now you can reload the pipeline from your repository on the Hub:

 ```py
 pipeline = StableDiffusionPipeline.from_pretrained("your-namespace/my-pipeline")
@@ -175,10 +168,10 @@ pipeline = StableDiffusionPipeline.from_pretrained("your-namespace/my-pipeline")

 ## Privacy

-Set `private=True` in [`~diffusers.utils.PushToHubMixin.push_to_hub`] to keep a model, scheduler, or pipeline files private.
+Set `private=True` in the [`~diffusers.utils.PushToHubMixin.push_to_hub`] function to keep your model, scheduler, or pipeline files private:

 ```py
 controlnet.push_to_hub("my-controlnet-model-private", private=True)
 ```

-Private repositories are only visible to you. Other users won't be able to clone the repository and it won't appear in search results. Even if a user has the URL to your private repository, they'll receive a `404 - Sorry, we can't find the page you are looking for`. You must be [logged in](https://huggingface.co/docs/huggingface_hub/quick-start#login) to load a model from a private repository.
+Private repositories are only visible to you. Other users won't be able to clone the repository, and it won't appear in search results. Even if a user has the URL to your private repository, they'll receive a `404 - Sorry, we can't find the page you are looking for`. You must be [logged in](https://huggingface.co/docs/huggingface_hub/quick-start#login) to load a model from a private repository.
@@ -0,0 +1,235 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# Scheduler features
+
+The scheduler is an important component of any diffusion model because it controls the entire denoising (or sampling) process. There are many types of schedulers, some are optimized for speed and some for quality. With Diffusers, you can modify the scheduler configuration to use custom noise schedules, sigmas, and rescale the noise schedule. Changing these parameters can have profound effects on inference quality and speed.
+
+This guide will demonstrate how to use these features to improve inference quality.
+
+> [!TIP]
+> Diffusers currently only supports the `timesteps` and `sigmas` parameters for a select list of schedulers and pipelines. Feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you want to extend these parameters to a scheduler and pipeline that does not currently support it!
+
+## Timestep schedules
+
+The timestep or noise schedule determines the amount of noise at each sampling step. The scheduler uses this to generate an image with the corresponding amount of noise at each step. The timestep schedule is generated from the scheduler's default configuration, but you can customize the scheduler to use new and optimized sampling schedules that aren't in Diffusers yet.
+
+For example, [Align Your Steps (AYS)](https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/) is a method for optimizing a sampling schedule to generate a high-quality image in as little as 10 steps. The optimal [10-step schedule](https://github.com/huggingface/diffusers/blob/a7bf77fc284810483f1e60afe34d1d27ad91ce2e/src/diffusers/schedulers/scheduling_utils.py#L51) for Stable Diffusion XL is:
+
+```py
+from diffusers.schedulers import AysSchedules
+
+sampling_schedule = AysSchedules["StableDiffusionXLTimesteps"]
+print(sampling_schedule)
+"[999, 845, 730, 587, 443, 310, 193, 116, 53, 13]"
+```
+
+You can use the AYS sampling schedule in a pipeline by passing it to the `timesteps` parameter.
+
+```py
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+    "SG161222/RealVisXL_V4.0",
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to("cuda")
+pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, algorithm_type="sde-dpmsolver++")
+
+prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up"
+generator = torch.Generator(device="cpu").manual_seed(2487854446)
+image = pipeline(
+    prompt=prompt,
+    negative_prompt="",
+    generator=generator,
+    timesteps=sampling_schedule,
+).images[0]
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ays.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">AYS timestep schedule 10 steps</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/10.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">Linearly-spaced timestep schedule 10 steps</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/25.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">Linearly-spaced timestep schedule 25 steps</figcaption>
+  </div>
+</div>
+
+## Timestep spacing
+
+The way sample steps are selected in the schedule can affect the quality of the generated image, especially with respect to [rescaling the noise schedule](#rescale-noise-schedule), which can enable a model to generate much brighter or darker images. Diffusers provides three timestep spacing methods:
+
+- `leading` creates evenly spaced steps
+- `linspace` includes the first and last steps and evenly selects the remaining intermediate steps
+- `trailing` only includes the last step and evenly selects the remaining intermediate steps starting from the end
+
+It is recommended to use the `trailing` spacing method because it generates higher quality images with more details when there are fewer sample steps. But the difference in quality is not as obvious for more standard sample step values.
+
+```py
+import torch
+from diffusers import StableDiffusionXLPipeline, DPMSolverMultistepScheduler
+
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+    "SG161222/RealVisXL_V4.0",
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to("cuda")
+pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, timestep_spacing="trailing")
+
+prompt = "A cinematic shot of a cute little black cat sitting on a pumpkin at night"
+generator = torch.Generator(device="cpu").manual_seed(2487854446)
+image = pipeline(
+    prompt=prompt,
+    negative_prompt="",
+    generator=generator,
+    num_inference_steps=5,
+).images[0]
+image
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/trailing_spacing.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">trailing spacing after 5 steps</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/leading_spacing.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">leading spacing after 5 steps</figcaption>
+  </div>
+</div>
+
+## Sigmas
+
+The `sigmas` parameter is the amount of noise added at each timestep according to the timestep schedule. Like the `timesteps` parameter, you can customize the `sigmas` parameter to control how much noise is added at each step. When you use a custom `sigmas` value, the `timesteps` are calculated from the custom `sigmas` value and the default scheduler configuration is ignored.
+
+For example, you can manually pass the [sigmas](https://github.com/huggingface/diffusers/blob/6529ee67ec02fcf58d2fd9242164ea002b351d75/src/diffusers/schedulers/scheduling_utils.py#L55) for something like the 10-step AYS schedule from before to the pipeline.
+
+```py
+import torch
+
+from diffusers import DiffusionPipeline, EulerDiscreteScheduler
+
+model_id = "stabilityai/stable-diffusion-xl-base-1.0"
+pipeline = DiffusionPipeline.from_pretrained(
+  "stabilityai/stable-diffusion-xl-base-1.0",
+  torch_dtype=torch.float16,
+  variant="fp16",
+).to("cuda")
+pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
+
+sigmas = [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.0]
+prompt = "anthropomorphic capybara wearing a suit and working with a computer"
+generator = torch.Generator(device='cuda').manual_seed(123)
+image = pipeline(
+    prompt=prompt,
+    num_inference_steps=10,
+    sigmas=sigmas,
+    generator=generator
+).images[0]
+```
+
+When you take a look at the scheduler's `timesteps` parameter, you'll see that it is the same as the AYS timestep schedule because the `timestep` schedule is calculated from the `sigmas`.
+
+```py
+print(f"timesteps: {pipeline.scheduler.timesteps}")
+"timesteps: tensor([999., 845., 730., 587., 443., 310., 193., 116.,  53.,  13.], device='cuda:0')"
+```
+
+### Karras sigmas
+
+> [!TIP]
+> Refer to the scheduler API [overview](../api/schedulers/overview) for a list of schedulers that support Karras sigmas.
+>
+> Karras sigmas should not be used for models that weren't trained with them. For example, the base Stable Diffusion XL model shouldn't use Karras sigmas but the [DreamShaperXL](https://hf.co/Lykon/dreamshaper-xl-1-0) model can since it was trained with Karras sigmas.
+
+Karras schedulers use the timestep schedule and sigmas from the [Elucidating the Design Space of Diffusion-Based Generative Models](https://hf.co/papers/2206.00364) paper. This scheduler variant applies a smaller amount of noise per step as it approaches the end of the sampling process compared to other schedulers, and can increase the level of details in the generated image.
+
+Enable Karras sigmas by setting `use_karras_sigmas=True` in the scheduler.
+
+```py
+import torch
+from diffusers import StableDiffusionXLPipeline, DPMSolverMultistepScheduler
+
+pipeline = StableDiffusionXLPipeline.from_pretrained(
+    "SG161222/RealVisXL_V4.0",
+    torch_dtype=torch.float16,
+    variant="fp16",
+).to("cuda")
+pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, algorithm_type="sde-dpmsolver++", use_karras_sigmas=True)
+
+prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up"
+generator = torch.Generator(device="cpu").manual_seed(2487854446)
+image = pipeline(
+    prompt=prompt,
+    negative_prompt="",
+    generator=generator,
+).images[0]
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/karras_sigmas_true.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">Karras sigmas enabled</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/karras_sigmas_false.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">Karras sigmas disabled</figcaption>
+  </div>
+</div>
+
+## Rescale noise schedule
+
+In the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://hf.co/papers/2305.08891) paper, the authors discovered that common noise schedules allowed some signal to leak into the last timestep. This signal leakage at inference can cause models to only generate images with medium brightness. By enforcing a zero signal-to-noise ratio (SNR) for the timestep schedule and sampling from the last timestep, the model can be improved to generate very bright or dark images.
+
+> [!TIP]
+> For inference, you need a model that has been trained with *v_prediction*. To train your own model with *v_prediction*, add the following flag to the [train_text_to_image.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) or [train_text_to_image_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image_lora.py) scripts.
+>
+> ```bash
+> --prediction_type="v_prediction"
+> ```
+
+For example, load the [ptx0/pseudo-journey-v2](https://hf.co/ptx0/pseudo-journey-v2) checkpoint which was trained with `v_prediction` and the [`DDIMScheduler`]. Configure the following parameters in the [`DDIMScheduler`]:
+
+* `rescale_betas_zero_snr=True` to rescale the noise schedule to zero SNR
+* `timestep_spacing="trailing"` to start sampling from the last timestep
+
+Set `guidance_rescale` in the pipeline to prevent over-exposure. A lower value increases brightness but some of the details may appear washed out.
+
+```py
+from diffusers import DiffusionPipeline, DDIMScheduler
+
+pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", use_safetensors=True)
+
+pipeline.scheduler = DDIMScheduler.from_config(
+    pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
+)
+pipeline.to("cuda")
+prompt = "cinematic photo of a snowy mountain at night with the northern lights aurora borealis overhead, 35mm photograph, film, professional, 4k, highly detailed"
+generator = torch.Generator(device="cpu").manual_seed(23)
+image = pipeline(prompt, guidance_rescale=0.7, generator=generator).images[0]
+image
+```
+
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/no-zero-snr.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">default Stable Diffusion v2-1 image</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/zero-snr.png"/>
+    <figcaption class="mt-2 text-center text-sm text-gray-500">image with zero SNR and trailing timestep spacing enabled</figcaption>
+  </div>
+</div>
@@ -10,273 +10,200 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

+# Load schedulers and models
+
 [[open-in-colab]]

-# Schedulers
+Diffusion pipelines are a collection of interchangeable schedulers and models that can be mixed and matched to tailor a pipeline to a specific use case. The scheduler encapsulates the entire denoising process, such as the number of denoising steps and the algorithm for finding the denoised sample. Schedulers are not parameterized or trained, so they don't take very much memory. The model is usually only concerned with the forward pass of going from a noisy input to a less noisy sample.

-A scheduler is an algorithm that provides instructions to the denoising process such as how much noise to remove at a certain step. It takes the model prediction from step *t* and applies an update for how to compute the next sample at step *t-1*. Different schedulers produce different results; some are faster while others are more accurate.
-
-Diffusers supports many schedulers and allows you to modify their timestep schedules, timestep spacing, and more, to generate high-quality images in fewer steps.
-
-This guide will show you how to load and customize schedulers.
-
-## Loading schedulers
-
-Schedulers don't have any parameters and are defined in a configuration file. Access the `.scheduler` attribute of a pipeline to view the configuration.
+This guide will show you how to load schedulers and models to customize a pipeline. You'll use the [stable-diffusion-v1-5/stable-diffusion-v1-5](https://hf.co/stable-diffusion-v1-5/stable-diffusion-v1-5) checkpoint throughout this guide, so let's load it first.

 ```py
 import torch
 from diffusers import DiffusionPipeline

 pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, device_map="cuda"
-)
-pipeline.scheduler
+    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
+).to("cuda")
 ```

-Load a different scheduler with [`~SchedulerMixin.from_pretrained`] and specify the `subfolder` argument to load the configuration file into the correct subfolder of the pipeline repository. Pass the new scheduler to the existing pipeline.
+You can see what scheduler this pipeline uses with the `pipeline.scheduler` attribute.
+
+```py
+pipeline.scheduler
+PNDMScheduler {
+  "_class_name": "PNDMScheduler",
+  "_diffusers_version": "0.21.4",
+  "beta_end": 0.012,
+  "beta_schedule": "scaled_linear",
+  "beta_start": 0.00085,
+  "clip_sample": false,
+  "num_train_timesteps": 1000,
+  "set_alpha_to_one": false,
+  "skip_prk_steps": true,
+  "steps_offset": 1,
+  "timestep_spacing": "leading",
+  "trained_betas": null
+}
+```
+
+## Load a scheduler
+
+Schedulers are defined by a configuration file that can be used by a variety of schedulers. Load a scheduler with the [`SchedulerMixin.from_pretrained`] method, and specify the `subfolder` parameter to load the configuration file from the correct subfolder of the pipeline repository.
+
+For example, to load the [`DDIMScheduler`]:
+
+```py
+from diffusers import DDIMScheduler, DiffusionPipeline
+
+ddim = DDIMScheduler.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="scheduler")
+```
+
+Then you can pass the newly loaded scheduler to the pipeline.
+
+```python
+pipeline = DiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5", scheduler=ddim, torch_dtype=torch.float16, use_safetensors=True
+).to("cuda")
+```
+
+## Compare schedulers
+
+Schedulers have their own unique strengths and weaknesses, making it difficult to quantitatively compare which scheduler works best for a pipeline. You typically have to make a trade-off between denoising speed and denoising quality. We recommend trying out different schedulers to find one that works best for your use case. Access the `pipeline.scheduler.compatibles` attribute to see what schedulers are compatible with a pipeline.
+
+Let's compare the [`LMSDiscreteScheduler`], [`EulerDiscreteScheduler`], [`EulerAncestralDiscreteScheduler`], and the [`DPMSolverMultistepScheduler`] on the following prompt and seed.
+
+```py
+import torch
+from diffusers import DiffusionPipeline
+
+pipeline = DiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
+).to("cuda")
+
+prompt = "A photograph of an astronaut riding a horse on Mars, high resolution, high definition."
+generator = torch.Generator(device="cuda").manual_seed(8)
+```
+
+To change the pipeline's scheduler, use the [`~ConfigMixin.from_config`] method to create a different scheduler from the current `pipeline.scheduler.config` and assign it to `pipeline.scheduler`.
+
+<hfoptions id="schedulers">
+<hfoption id="LMSDiscreteScheduler">
+
+[`LMSDiscreteScheduler`] typically generates higher quality images than the default scheduler.
+
+```py
+from diffusers import LMSDiscreteScheduler
+
+pipeline.scheduler = LMSDiscreteScheduler.from_config(pipeline.scheduler.config)
+image = pipeline(prompt, generator=generator).images[0]
+image
+```
+
+</hfoption>
+<hfoption id="EulerDiscreteScheduler">
+
+[`EulerDiscreteScheduler`] can generate higher quality images in just 30 steps.
+
+```py
+from diffusers import EulerDiscreteScheduler
+
+pipeline.scheduler = EulerDiscreteScheduler.from_config(pipeline.scheduler.config)
+image = pipeline(prompt, generator=generator).images[0]
+image
+```
+
+</hfoption>
+<hfoption id="EulerAncestralDiscreteScheduler">
+
+[`EulerAncestralDiscreteScheduler`] can generate higher quality images in just 30 steps.
+
+```py
+from diffusers import EulerAncestralDiscreteScheduler
+
+pipeline.scheduler = EulerAncestralDiscreteScheduler.from_config(pipeline.scheduler.config)
+image = pipeline(prompt, generator=generator).images[0]
+image
+```
+
+</hfoption>
+<hfoption id="DPMSolverMultistepScheduler">
+
+[`DPMSolverMultistepScheduler`] provides a balance between speed and quality and can generate higher quality images in just 20 steps.

 ```py
 from diffusers import DPMSolverMultistepScheduler

-dpm = DPMSolverMultistepScheduler.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"
-)
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    scheduler=dpm,
-    torch_dtype=torch.float16,
-    device_map="cuda"
-)
-pipeline.scheduler
-```
-
-## Timestep schedules
-
-Timestep or noise schedule decides how noise is distributed over the denoising process. The schedule can be linear or more concentrated toward the beginning or end. It is a precomputed sequence of noise levels generated from the scheduler's default configuration, but it can be customized to use other schedules.
-
-> [!TIP]
-> The `timesteps` argument is only supported for a select list of schedulers and pipelines. Feel free to open a feature request if you want to extend these parameters to a scheduler and pipeline that does not currently support it!
-
-The example below uses the [Align Your Steps (AYS)](https://research.nvidia.com/labs/toronto-ai/AlignYourSteps/) schedule which can generate a high-quality image in 10 steps, significantly speeding up generation and reducing computation time.
-
-Import the schedule and pass it to the `timesteps` argument in the pipeline.
-
-```py
-import torch
-from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
-from diffusers.schedulers import AysSchedules
-
-sampling_schedule = AysSchedules["StableDiffusionXLTimesteps"]
-print(sampling_schedule)
-"[999, 845, 730, 587, 443, 310, 193, 116, 53, 13]"
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "SG161222/RealVisXL_V4.0",
-    torch_dtype=torch.float16,
-    device_map="cuda"
-)
-pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
-  pipeline.scheduler.config, algorithm_type="sde-dpmsolver++"
-)
-
-prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up"
-image = pipeline(
-    prompt=prompt,
-    negative_prompt="",
-    timesteps=sampling_schedule,
-).images[0]
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ays.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">AYS timestep schedule 10 steps</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/10.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">Linearly-spaced timestep schedule 10 steps</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/25.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">Linearly-spaced timestep schedule 25 steps</figcaption>
-  </div>
-</div>
-
-### Rescaling schedules
-
-Denoising should begin with pure noise and the signal-to-noise (SNR) ration should be zero. However, some models don't actually start from pure noise which makes it difficult to generate images at brightness extremes.
-
-> [!TIP]
-> Train your own model with `v_prediction` by adding the `--prediction_type="v_prediction"` flag to your training script. You can also [search](https://huggingface.co/search/full-text?q=v_prediction&type=model) for existing models trained with `v_prediction`.
-
-To fix this, a model must be trained with `v_prediction`. If a model is trained with `v_prediction`, then enable the following arguments in the scheduler.
-
- Set `rescale_betas_zero_snr=True` to rescale the noise schedule to the very last timestep with exactly zero SNR
- Set `timestep_spacing="trailing"` to force sampling from the last timestep with pure noise
-
-```py
-from diffusers import DiffusionPipeline, DDIMScheduler
-
-pipeline = DiffusionPipeline.from_pretrained("ptx0/pseudo-journey-v2", device_map="cuda")
-
-pipeline.scheduler = DDIMScheduler.from_config(
-    pipeline.scheduler.config, rescale_betas_zero_snr=True, timestep_spacing="trailing"
-)
-```
-
-Set `guidance_rescale` in the pipeline to avoid overexposed images. A lower value increases brightness, but some details may appear washed out.
-
-```py
-prompt = """
-cinematic photo of a snowy mountain at night with the northern lights aurora borealis
-overhead, 35mm photograph, film, professional, 4k, highly detailed
-"""
-image = pipeline(prompt, guidance_rescale=0.7).images[0]
-```
-
-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/no-zero-snr.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">default Stable Diffusion v2-1 image</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/zero-snr.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">image with zero SNR and trailing timestep spacing enabled</figcaption>
-  </div>
-</div>
-
-## Timestep spacing
-
-Timestep spacing refers to the specific steps *t* to sample from from the schedule. Diffusers provides three spacing types as shown below.
-
-| spacing strategy | spacing calculation | example timesteps |
-|---|---|---|
-| `leading` | evenly spaced steps | `[900, 800, 700, ..., 100, 0]` |
-| `linspace` | include first and last steps and evenly divide remaining intermediate steps | `[1000, 888.89, 777.78, ..., 111.11, 0]` |
-| `trailing` | include last step and evenly divide remaining intermediate steps beginning from the end | `[999, 899, 799, 699, 599, 499, 399, 299, 199, 99]` |
-
-Pass the spacing strategy to the `timestep_spacing` argument in the scheduler.
-
-> [!TIP]
-> The `trailing` strategy typically produces higher quality images with more details with fewer steps, but the difference in quality is not as obvious for more standard step values.
-
-```py
-import torch
-from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "SG161222/RealVisXL_V4.0",
-    torch_dtype=torch.float16,
-    device_map="cuda"
-)
-pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
-  pipeline.scheduler.config, timestep_spacing="trailing"
-)
-
-prompt = "A cinematic shot of a cute little black cat sitting on a pumpkin at night"
-image = pipeline(
-    prompt=prompt,
-    negative_prompt="",
-    num_inference_steps=5,
-).images[0]
+pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
+image = pipeline(prompt, generator=generator).images[0]
 image
 ```

-<div class="flex gap-4">
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/trailing_spacing.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">trailing spacing after 5 steps</figcaption>
-  </div>
-  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/leading_spacing.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">leading spacing after 5 steps</figcaption>
-  </div>
-</div>
-
-## Sigmas
-
-Sigmas is a measure of how noisy a sample is at a certain step as defined by the schedule. When using custom `sigmas`, the `timesteps` are calculated from these values instead of the default scheduler configuration.
-
-> [!TIP]
-> The `sigmas` argument is only supported for a select list of schedulers and pipelines. Feel free to open a feature request if you want to extend these parameters to a scheduler and pipeline that does not currently support it!
-
-Pass the custom sigmas to the `sigmas` argument in the pipeline. The example below uses the [sigmas](https://github.com/huggingface/diffusers/blob/6529ee67ec02fcf58d2fd9242164ea002b351d75/src/diffusers/schedulers/scheduling_utils.py#L55) from the 10-step AYS schedule.
-
-```py
-import torch
-from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "SG161222/RealVisXL_V4.0",
-    torch_dtype=torch.float16,
-    device_map="cuda"
-)
-pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
-  pipeline.scheduler.config, algorithm_type="sde-dpmsolver++"
-)
-
-sigmas = [14.615, 6.315, 3.771, 2.181, 1.342, 0.862, 0.555, 0.380, 0.234, 0.113, 0.0]
-prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up"
-image = pipeline(
-    prompt=prompt,
-    negative_prompt="",
-    sigmas=sigmas,
-).images[0]
-```
-
-### Karras sigmas
-
-[Karras sigmas](https://huggingface.co/papers/2206.00364) resamples the noise schedule for more efficient sampling by clustering sigmas more densely in the middle of the sequence where structure reconstruction is critical, while using fewer sigmas at the beginning and end where noise changes have less impact. This can increase the level of details in a generated image.
-
-Set `use_karras_sigmas=True` in the scheduler to enable it.
-
-```py
-import torch
-from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
-
-pipeline = DiffusionPipeline.from_pretrained(
-    "SG161222/RealVisXL_V4.0",
-    torch_dtype=torch.float16,
-    device_map="cuda"
-)
-pipeline.scheduler = DPMSolverMultistepScheduler.from_config(
-  pipeline.scheduler.config,
-  algorithm_type="sde-dpmsolver++",
-  use_karras_sigmas=True,
-)
-
-prompt = "A cinematic shot of a cute little rabbit wearing a jacket and doing a thumbs up"
-image = pipeline(
-    prompt=prompt,
-    negative_prompt="",
-    sigmas=sigmas,
-).images[0]
-```
+</hfoption>
+</hfoptions>

 <div class="flex gap-4">
  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/karras_sigmas_true.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">Karras sigmas enabled</figcaption>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_lms.png" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">LMSDiscreteScheduler</figcaption>
  </div>
  <div>
-    <img class="rounded-xl" src="https://huggingface.co/datasets/stevhliu/testing-images/resolve/main/karras_sigmas_false.png"/>
-    <figcaption class="mt-2 text-center text-sm text-gray-500">Karras sigmas disabled</figcaption>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_euler_discrete.png" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">EulerDiscreteScheduler</figcaption>
+  </div>
+</div>
+<div class="flex gap-4">
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_euler_ancestral.png" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">EulerAncestralDiscreteScheduler</figcaption>
+  </div>
+  <div>
+    <img class="rounded-xl" src="https://huggingface.co/datasets/patrickvonplaten/images/resolve/main/diffusers_docs/astronaut_dpm.png" />
+    <figcaption class="mt-2 text-center text-sm text-gray-500">DPMSolverMultistepScheduler</figcaption>
  </div>
 </div>

-Refer to the scheduler API [overview](../api/schedulers/overview) for a list of schedulers that support Karras sigmas. It should only be used for models trained with Karras sigmas.
+Most images look very similar and are comparable in quality. Again, it often comes down to your specific use case so a good approach is to run multiple different schedulers and compare the results.

-## Choosing a scheduler
+## Models

-It's important to try different schedulers to find the best one for your use case. Here are a few recommendations to help you get started.
+Models are loaded with the [`ModelMixin.from_pretrained`] method, which downloads and caches the latest version of the model weights and configurations. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache instead of re-downloading them.

- DPM++ 2M SDE Karras is generally a good all-purpose option.
- [`TCDScheduler`] works well for distilled models.
- [`FlowMatchEulerDiscreteScheduler`] and [`FlowMatchHeunDiscreteScheduler`] for FlowMatch models.
- [`EulerDiscreteScheduler`] or [`EulerAncestralDiscreteScheduler`] for generating anime style images.
- DPM++ 2M paired with [`LCMScheduler`] on SDXL for generating realistic images.
+Models can be loaded from a subfolder with the `subfolder` argument. For example, the model weights for [stable-diffusion-v1-5/stable-diffusion-v1-5](https://hf.co/stable-diffusion-v1-5/stable-diffusion-v1-5) are stored in the [unet](https://hf.co/stable-diffusion-v1-5/stable-diffusion-v1-5/tree/main/unet) subfolder.

-## Resources
+```python
+from diffusers import UNet2DConditionModel

- Read the [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) paper for more details about rescaling the noise schedule to enforce zero SNR.
+unet = UNet2DConditionModel.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet", use_safetensors=True)
+```
+
+They can also be directly loaded from a [repository](https://huggingface.co/google/ddpm-cifar10-32/tree/main).
+
+```python
+from diffusers import UNet2DModel
+
+unet = UNet2DModel.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
+```
+
+To load and save model variants, specify the `variant` argument in [`ModelMixin.from_pretrained`] and [`ModelMixin.save_pretrained`].
+
+```python
+from diffusers import UNet2DConditionModel
+
+unet = UNet2DConditionModel.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet", variant="non_ema", use_safetensors=True
+)
+unet.save_pretrained("./local-unet", variant="non_ema")
+```
+
+Use the `torch_dtype` argument in [`~ModelMixin.from_pretrained`] to specify the dtype to load a model in.
+
+```py
+from diffusers import AutoModel
+
+unet = AutoModel.from_pretrained(
+    "stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet", torch_dtype=torch.float16
+)
+```
+
+You can also use the [torch.Tensor.to](https://docs.pytorch.org/docs/stable/generated/torch.Tensor.to.html) method to convert to the specified dtype on the fly. It converts *all* weights unlike the `torch_dtype` argument that respects the `_keep_in_fp32_modules`. This is important for models whose layers must remain in fp32 for numerical stability and best generation quality (see example [here](https://github.com/huggingface/diffusers/blob/f864a9a352fa4a220d860bfdd1782e3e5af96382/src/diffusers/models/transformers/transformer_wan.py#L374)).
@@ -98,7 +98,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch.bfloat16
    },
-  components_to_quantize="transformer"
+  components_to_quantize=["transformer"]
 )

 pipeline = HunyuanVideoPipeline.from_pretrained(
@@ -48,10 +48,10 @@ t2i_pipeline = t2i_blocks.init_pipeline(modular_repo_id, components_manager=comp
 </hfoption>
 </hfoptions>

-组件仅在调用 [`~ModularPipeline.load_components`] 或 [`~ModularPipeline.load_components`] 时加载和注册。以下示例使用 [`~ModularPipeline.load_components`] 创建第二个管道，重用第一个管道的所有组件，并将其分配到不同的集合。
+组件仅在调用 [`~ModularPipeline.load_components`] 或 [`~ModularPipeline.load_default_components`] 时加载和注册。以下示例使用 [`~ModularPipeline.load_default_components`] 创建第二个管道，重用第一个管道的所有组件，并将其分配到不同的集合。

 ```py
-pipe.load_components()
+pipe.load_default_components()
 pipe2 = ModularPipeline.from_pretrained("YiYiXu/modular-demo-auto", components_manager=comp, collection="test2")
 ```

@@ -185,4 +185,4 @@ comp.enable_auto_cpu_offload(device="cuda")

 所有模型开始时都在 CPU 上，[`ComponentsManager`] 在需要它们之前将它们移动到适当的设备，并在 GPU 内存不足时将其他模型移回 CPU。

-您可以设置自己的规则来决定哪些模型要卸载。
+您可以设置自己的规则来决定哪些模型要卸载。
@@ -73,13 +73,13 @@ ComponentSpec(name='guider', type_hint=<class 'diffusers.guiders.perturbed_atten
 }
 ```

-引导器只有在调用 [`~ModularPipeline.load_components`] 之后才会创建，基于 `modular_model_index.json` 中的加载规范。
+引导器只有在调用 [`~ModularPipeline.load_default_components`] 之后才会创建，基于 `modular_model_index.json` 中的加载规范。

 ```py
 t2i_pipeline = t2i_blocks.init_pipeline("YiYiXu/modular-doc-guider")
 # 在初始化时未创建
 assert t2i_pipeline.guider is None
-t2i_pipeline.load_components()
+t2i_pipeline.load_default_components()
 # 加载为 PAG 引导器
 t2i_pipeline.guider
 ```
@@ -170,4 +170,4 @@ t2i_pipeline.push_to_hub("YiYiXu/modular-doc-guider")
 ```

 </hfoption>
-</hfoptions>
+</hfoptions>
@@ -28,7 +28,7 @@ blocks = SequentialPipelineBlocks.from_blocks_dict(TEXT2IMAGE_BLOCKS)
 modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
 pipeline = blocks.init_pipeline(modular_repo_id)

-pipeline.load_components(torch_dtype=torch.float16)
+pipeline.load_default_components(torch_dtype=torch.float16)
 pipeline.to("cuda")

 image = pipeline(prompt="Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", output="images")[0]
@@ -48,7 +48,7 @@ blocks = SequentialPipelineBlocks.from_blocks_dict(IMAGE2IMAGE_BLOCKS)
 modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
 pipeline = blocks.init_pipeline(modular_repo_id)

-pipeline.load_components(torch_dtype=torch.float16)
+pipeline.load_default_components(torch_dtype=torch.float16)
 pipeline.to("cuda")

 url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png"
@@ -72,7 +72,7 @@ blocks = SequentialPipelineBlocks.from_blocks_dict(INPAINT_BLOCKS)
 modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
 pipeline = blocks.init_pipeline(modular_repo_id)

-pipeline.load_components(torch_dtype=torch.float16)
+pipeline.load_default_components(torch_dtype=torch.float16)
 pipeline.to("cuda")

 img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png"
@@ -176,15 +176,15 @@ diffdiff_pipeline = ModularPipeline.from_pretrained(modular_repo_id, trust_remot

 ## 加载组件

-一个[`ModularPipeline`]不会自动实例化组件。它只加载配置和组件规范。您可以使用[`~ModularPipeline.load_components`]加载所有组件，或仅使用[`~ModularPipeline.load_components`]加载特定组件。
+一个[`ModularPipeline`]不会自动实例化组件。它只加载配置和组件规范。您可以使用[`~ModularPipeline.load_default_components`]加载所有组件，或仅使用[`~ModularPipeline.load_components`]加载特定组件。

 <hfoptions id="load">
-<hfoption id="load_components">
+<hfoption id="load_default_components">

 ```py
 import torch

-t2i_pipeline.load_components(torch_dtype=torch.float16)
+t2i_pipeline.load_default_components(torch_dtype=torch.float16)
 t2i_pipeline.to("cuda")
 ```

@@ -175,7 +175,7 @@ print(dd_blocks)
 将 [`SequentialPipelineBlocks`] 转换为 [`ModularPipeline`]，使用 [`ModularPipeline.init_pipeline`] 方法。这会初始化从 `modular_model_index.json` 文件加载的预期组件。通过调用 [`ModularPipeline.load_default_components`]。

-初始化[`ComponentManager`]时传入pipeline是一个好主意，以帮助管理不同的组件。一旦调用[`~ModularPipeline.load_components`]，组件就会被注册到[`ComponentManager`]中，并且可以在工作流之间共享。下面的例子使用`collection`参数为组件分配了一个`"diffdiff"`标签，以便更好地组织。
+初始化[`ComponentsManager`]时传入pipeline是一个好主意，以帮助管理不同的组件。一旦调用[`~ModularPipeline.load_default_components`]，组件就会被注册到[`ComponentsManager`]中，并且可以在工作流之间共享。下面的例子使用`collection`参数为组件分配了一个`"diffdiff"`标签，以便更好地组织。

 ```py
 from diffusers.modular_pipelines import ComponentsManager
@@ -209,11 +209,11 @@ ip_adapter_block = StableDiffusionXLAutoIPAdapterStep()
 dd_blocks.sub_blocks.insert("ip_adapter", ip_adapter_block, 0)
 ```

-调用[`~ModularPipeline.init_pipeline`]来初始化一个[`ModularPipeline`]，并使用[`~ModularPipeline.load_components`]加载模型组件。加载并设置IP-Adapter以运行pipeline。
+调用[`~ModularPipeline.init_pipeline`]来初始化一个[`ModularPipeline`]，并使用[`~ModularPipeline.load_default_components`]加载模型组件。加载并设置IP-Adapter以运行pipeline。

 ```py
 dd_pipeline = dd_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
-dd_pipeline.load_components(torch_dtype=torch.float16)
+dd_pipeline.load_default_components(torch_dtype=torch.float16)
 dd_pipeline.loader.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
 dd_pipeline.loader.set_ip_adapter_scale(0.6)
 dd_pipeline = dd_pipeline.to(device)
@@ -261,14 +261,14 @@ class SDXLDiffDiffControlNetDenoiseStep(StableDiffusionXLDenoiseLoopWrapper):
 controlnet_denoise_block = SDXLDiffDiffControlNetDenoiseStep()
 ```

-插入 `controlnet_input` 块并用新的 `controlnet_denoise_block` 替换 `denoise` 块。初始化一个 [`ModularPipeline`] 并将 [`~ModularPipeline.load_components`] 加载到其中。
+插入 `controlnet_input` 块并用新的 `controlnet_denoise_block` 替换 `denoise` 块。初始化一个 [`ModularPipeline`]，并调用 [`~ModularPipeline.load_default_components`] 加载组件。

 ```py
 dd_blocks.sub_blocks.insert("controlnet_input", control_input_block, 7)
 dd_blocks.sub_blocks["denoise"] = controlnet_denoise_block

 dd_pipeline = dd_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
-dd_pipeline.load_components(torch_dtype=torch.float16)
+dd_pipeline.load_default_components(torch_dtype=torch.float16)
 dd_pipeline = dd_pipeline.to(device)

 control_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/diffdiff_tomato_canny.jpeg")
@@ -322,7 +322,7 @@ DIFFDIFF_AUTO_BLOCKS.insert("controlnet_input",StableDiffusionXLControlNetAutoIn
 ```py
 dd_auto_blocks = SequentialPipelineBlocks.from_blocks_dict(DIFFDIFF_AUTO_BLOCKS)
 dd_pipeline = dd_auto_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
-dd_pipeline.load_components(torch_dtype=torch.float16)
+dd_pipeline.load_default_components(torch_dtype=torch.float16)
 ```

 ## 分享
@@ -342,5 +342,5 @@ from diffusers.modular_pipelines import ModularPipeline, ComponentsManager
 components = ComponentsManager()

 diffdiff_pipeline = ModularPipeline.from_pretrained("YiYiXu/modular-diffdiff-0704", trust_remote_code=True, components_manager=components, collection="diffdiff")
-diffdiff_pipeline.load_components(torch_dtype=torch.float16)
+diffdiff_pipeline.load_default_components(torch_dtype=torch.float16)
 ```
@@ -223,7 +223,7 @@ from diffusers.image_processor import VaeImageProcessor
 import torch 

 vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=torch.bfloat16).to("cuda")
-vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
+vae_scale_factor = 2 ** (len(vae.config.block_out_channels))
 image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)

 with torch.no_grad():
@@ -1399,7 +1399,6 @@ def main(args):
                torch_dtype = torch.float16
            elif args.prior_generation_precision == "bf16":
                torch_dtype = torch.bfloat16
-
            pipeline = FluxPipeline.from_pretrained(
                args.pretrained_model_name_or_path,
                torch_dtype=torch_dtype,
@@ -1420,8 +1419,7 @@ def main(args):
            for example in tqdm(
                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
            ):
-                with torch.autocast(device_type=accelerator.device.type, dtype=torch_dtype):
-                    images = pipeline(prompt=example["prompt"]).images
+                images = pipeline(example["prompt"]).images

                for i, image in enumerate(images):
                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
@@ -1705,12 +1705,6 @@ class FaithDiffStableDiffusionXLPipeline(
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        """
-        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
-        deprecate(
-            "enable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_tiling()
        self.unet.denoise_encoder.enable_tiling()

@@ -1719,12 +1713,6 @@ class FaithDiffStableDiffusionXLPipeline(
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
-        deprecate(
-            "disable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_tiling()
        self.unet.denoise_encoder.disable_tiling()

@@ -35,7 +35,6 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
 from diffusers.utils import (
    USE_PEFT_BACKEND,
-    deprecate,
    is_torch_xla_available,
    logging,
    replace_example_docstring,
@@ -644,12 +643,6 @@ class FluxKontextPipeline(
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        """
-        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
-        deprecate(
-            "enable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_tiling()

    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.disable_vae_tiling
@@ -658,12 +651,6 @@ class FluxKontextPipeline(
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
-        deprecate(
-            "disable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_tiling()

    def preprocess_image(self, image: PipelineImageInput, _auto_resize: bool, multiple_of: int) -> torch.Tensor:
@@ -30,7 +30,6 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
 from diffusers.utils import (
    USE_PEFT_BACKEND,
-    deprecate,
    is_torch_xla_available,
    logging,
    replace_example_docstring,
@@ -527,12 +526,6 @@ class RFInversionFluxPipeline(
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        """
-        depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
-        deprecate(
-            "enable_vae_slicing",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_slicing()

    def disable_vae_slicing(self):
@@ -540,12 +533,6 @@ class RFInversionFluxPipeline(
        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
-        deprecate(
-            "disable_vae_slicing",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_slicing()

    def enable_vae_tiling(self):
@@ -554,12 +541,6 @@ class RFInversionFluxPipeline(
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        """
-        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
-        deprecate(
-            "enable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_tiling()

    def disable_vae_tiling(self):
@@ -567,12 +548,6 @@ class RFInversionFluxPipeline(
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
-        deprecate(
-            "disable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_tiling()

    def prepare_latents_inversion(
@@ -35,7 +35,6 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
 from diffusers.utils import (
    USE_PEFT_BACKEND,
-    deprecate,
    is_torch_xla_available,
    logging,
    replace_example_docstring,
@@ -703,12 +702,6 @@ class FluxSemanticGuidancePipeline(
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        """
-        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
-        deprecate(
-            "enable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_tiling()

    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.disable_vae_tiling
@@ -717,12 +710,6 @@ class FluxSemanticGuidancePipeline(
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
-        deprecate(
-            "disable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_tiling()

    # Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_latents
@@ -28,7 +28,6 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
 from diffusers.utils import (
    USE_PEFT_BACKEND,
-    deprecate,
    is_torch_xla_available,
    logging,
    replace_example_docstring,
@@ -504,12 +503,6 @@ class FluxCFGPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixi
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        """
-        depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
-        deprecate(
-            "enable_vae_slicing",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_slicing()

    def disable_vae_slicing(self):
@@ -517,12 +510,6 @@ class FluxCFGPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixi
        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
-        deprecate(
-            "disable_vae_slicing",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_slicing()

    def enable_vae_tiling(self):
@@ -531,12 +518,6 @@ class FluxCFGPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixi
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        """
-        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
-        deprecate(
-            "enable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_tiling()

    def disable_vae_tiling(self):
@@ -544,12 +525,6 @@ class FluxCFGPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixi
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
-        deprecate(
-            "disable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_tiling()

    def prepare_latents(
@@ -29,7 +29,11 @@ from diffusers.models.transformers import SD3Transformer2DModel
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.pipelines.stable_diffusion_3.pipeline_output import StableDiffusion3PipelineOutput
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
-from diffusers.utils import is_torch_xla_available, logging, replace_example_docstring
+from diffusers.utils import (
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
 from diffusers.utils.torch_utils import randn_tensor


@@ -504,12 +504,6 @@ class StableDiffusionBoxDiffPipeline(
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        """
-        depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
-        deprecate(
-            "enable_vae_slicing",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_slicing()

    def disable_vae_slicing(self):
@@ -517,12 +511,6 @@ class StableDiffusionBoxDiffPipeline(
        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
-        deprecate(
-            "disable_vae_slicing",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_slicing()

    def enable_vae_tiling(self):
@@ -531,12 +519,6 @@ class StableDiffusionBoxDiffPipeline(
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        """
-        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
-        deprecate(
-            "enable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_tiling()

    def disable_vae_tiling(self):
@@ -544,12 +526,6 @@ class StableDiffusionBoxDiffPipeline(
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
-        deprecate(
-            "disable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_tiling()

    def _encode_prompt(
@@ -471,12 +471,6 @@ class StableDiffusionPAGPipeline(
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        """
-        depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
-        deprecate(
-            "enable_vae_slicing",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_slicing()

    def disable_vae_slicing(self):
@@ -484,12 +478,6 @@ class StableDiffusionPAGPipeline(
        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
-        deprecate(
-            "disable_vae_slicing",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_slicing()

    def enable_vae_tiling(self):
@@ -498,12 +486,6 @@ class StableDiffusionPAGPipeline(
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        """
-        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
-        deprecate(
-            "enable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_tiling()

    def disable_vae_tiling(self):
@@ -511,12 +493,6 @@ class StableDiffusionPAGPipeline(
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
-        deprecate(
-            "disable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_tiling()

    def _encode_prompt(
@@ -26,7 +26,7 @@ from diffusers.models import AutoencoderKLHunyuanVideo, HunyuanVideoTransformer3
 from diffusers.pipelines.hunyuan_video.pipeline_output import HunyuanVideoPipelineOutput
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
-from diffusers.utils import deprecate, is_torch_xla_available, logging, replace_example_docstring
+from diffusers.utils import is_torch_xla_available, logging, replace_example_docstring
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.video_processor import VideoProcessor

@@ -481,12 +481,6 @@ class HunyuanVideoSTGPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        """
-        depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
-        deprecate(
-            "enable_vae_slicing",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_slicing()

    def disable_vae_slicing(self):
@@ -494,12 +488,6 @@ class HunyuanVideoSTGPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
-        deprecate(
-            "disable_vae_slicing",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_slicing()

    def enable_vae_tiling(self):
@@ -508,12 +496,6 @@ class HunyuanVideoSTGPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        """
-        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
-        deprecate(
-            "enable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_tiling()

    def disable_vae_tiling(self):
@@ -521,12 +503,6 @@ class HunyuanVideoSTGPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
-        deprecate(
-            "disable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_tiling()

    @property
@@ -26,7 +26,11 @@ from diffusers.models import AutoencoderKLMochi, MochiTransformer3DModel
 from diffusers.pipelines.mochi.pipeline_output import MochiPipelineOutput
 from diffusers.pipelines.pipeline_utils import DiffusionPipeline
 from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
-from diffusers.utils import deprecate, is_torch_xla_available, logging, replace_example_docstring
+from diffusers.utils import (
+    is_torch_xla_available,
+    logging,
+    replace_example_docstring,
+)
 from diffusers.utils.torch_utils import randn_tensor
 from diffusers.video_processor import VideoProcessor

@@ -454,12 +458,6 @@ class MochiSTGPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
        Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
        compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
        """
-        depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
-        deprecate(
-            "enable_vae_slicing",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_slicing()

    def disable_vae_slicing(self):
@@ -467,12 +465,6 @@ class MochiSTGPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
        Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
-        deprecate(
-            "disable_vae_slicing",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_slicing()

    def enable_vae_tiling(self):
@@ -481,12 +473,6 @@ class MochiSTGPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        """
-        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
-        deprecate(
-            "enable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_tiling()

    def disable_vae_tiling(self):
@@ -494,12 +480,6 @@ class MochiSTGPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
-        deprecate(
-            "disable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_tiling()

    def prepare_latents(
@@ -1131,7 +1131,6 @@ def main(args):
                torch_dtype = torch.float16
            elif args.prior_generation_precision == "bf16":
                torch_dtype = torch.bfloat16
-
            pipeline = FluxPipeline.from_pretrained(
                args.pretrained_model_name_or_path,
                torch_dtype=torch_dtype,
@@ -1152,8 +1151,7 @@ def main(args):
            for example in tqdm(
                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
            ):
-                with torch.autocast(device_type=accelerator.device.type, dtype=torch_dtype):
-                    images = pipeline(prompt=example["prompt"]).images
+                images = pipeline(example["prompt"]).images

                for i, image in enumerate(images):
                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
@@ -1161,7 +1159,8 @@ def main(args):
                    image.save(image_filename)

            del pipeline
-            free_memory()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()

    # Handle the repository creation
    if accelerator.is_main_process:
@@ -1729,10 +1728,6 @@ def main(args):
                            device=accelerator.device,
                            prompt=args.instance_prompt,
                        )
-                    else:
-                        prompt_embeds, pooled_prompt_embeds, text_ids = compute_text_embeddings(
-                            prompts, text_encoders, tokenizers
-                        )

                # Convert images to latent space
                if args.cache_latents:
@@ -29,9 +29,8 @@ from pathlib import Path
 import numpy as np
 import torch
 import transformers
-from accelerate import Accelerator, DistributedType
+from accelerate import Accelerator
 from accelerate.logging import get_logger
-from accelerate.state import AcceleratorState
 from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
 from huggingface_hub import create_repo, upload_folder
 from huggingface_hub.utils import insecure_hashlib
@@ -1223,9 +1222,6 @@ def main(args):
        kwargs_handlers=[kwargs],
    )

-    if accelerator.distributed_type == DistributedType.DEEPSPEED:
-        AcceleratorState().deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = args.train_batch_size
-
    # Disable AMP for MPS.
    if torch.backends.mps.is_available():
        accelerator.native_amp = False
@@ -1274,7 +1270,6 @@ def main(args):
                subfolder="transformer",
                revision=args.revision,
                variant=args.variant,
-                torch_dtype=torch_dtype,
            )
            pipeline = FluxKontextPipeline.from_pretrained(
                args.pretrained_model_name_or_path,
@@ -1297,8 +1292,7 @@ def main(args):
            for example in tqdm(
                sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
            ):
-                with torch.autocast(device_type=accelerator.device.type, dtype=torch_dtype):
-                    images = pipeline(prompt=example["prompt"]).images
+                images = pipeline(example["prompt"]).images

                for i, image in enumerate(images):
                    hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
@@ -1442,20 +1436,17 @@ def main(args):
            text_encoder_one_lora_layers_to_save = None
            modules_to_save = {}
            for model in models:
-                if isinstance(unwrap_model(model), type(unwrap_model(transformer))):
-                    model = unwrap_model(model)
+                if isinstance(model, type(unwrap_model(transformer))):
                    transformer_lora_layers_to_save = get_peft_model_state_dict(model)
                    modules_to_save["transformer"] = model
-                elif isinstance(unwrap_model(model), type(unwrap_model(text_encoder_one))):
-                    model = unwrap_model(model)
+                elif isinstance(model, type(unwrap_model(text_encoder_one))):
                    text_encoder_one_lora_layers_to_save = get_peft_model_state_dict(model)
                    modules_to_save["text_encoder"] = model
                else:
                    raise ValueError(f"unexpected save model: {model.__class__}")

                # make sure to pop weight so that corresponding model is not saved again
-                if weights:
-                    weights.pop()
+                weights.pop()

            FluxKontextPipeline.save_lora_weights(
                output_dir,
@@ -1468,25 +1459,15 @@ def main(args):
        transformer_ = None
        text_encoder_one_ = None

-        if not accelerator.distributed_type == DistributedType.DEEPSPEED:
-            while len(models) > 0:
-                model = models.pop()
+        while len(models) > 0:
+            model = models.pop()

-                if isinstance(unwrap_model(model), type(unwrap_model(transformer))):
-                    transformer_ = unwrap_model(model)
-                elif isinstance(unwrap_model(model), type(unwrap_model(text_encoder_one))):
-                    text_encoder_one_ = unwrap_model(model)
-                else:
-                    raise ValueError(f"unexpected save model: {model.__class__}")
-
-        else:
-            transformer_ = FluxTransformer2DModel.from_pretrained(
-                args.pretrained_model_name_or_path, subfolder="transformer"
-            )
-            transformer_.add_adapter(transformer_lora_config)
-            text_encoder_one_ = text_encoder_cls_one.from_pretrained(
-                args.pretrained_model_name_or_path, subfolder="text_encoder"
-            )
+            if isinstance(model, type(unwrap_model(transformer))):
+                transformer_ = model
+            elif isinstance(model, type(unwrap_model(text_encoder_one))):
+                text_encoder_one_ = model
+            else:
+                raise ValueError(f"unexpected save model: {model.__class__}")

        lora_state_dict = FluxKontextPipeline.lora_state_dict(input_dir)

@@ -1918,10 +1899,6 @@ def main(args):
                            device=accelerator.device,
                            prompt=args.instance_prompt,
                        )
-                    else:
-                        prompt_embeds, pooled_prompt_embeds, text_ids = compute_text_embeddings(
-                            prompts, text_encoders, tokenizers
-                        )

                # Convert images to latent space
                if args.cache_latents:
@@ -2086,7 +2063,7 @@ def main(args):
                progress_bar.update(1)
                global_step += 1

-                if accelerator.is_main_process or accelerator.distributed_type == DistributedType.DEEPSPEED:
+                if accelerator.is_main_process:
                    if global_step % args.checkpointing_steps == 0:
                        # _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
                        if args.checkpoints_total_limit is not None:
@@ -1760,7 +1760,7 @@
    "clip_local = None\n",
    "clip_pos = None\n",
    "\n",
-    "# constants for data handling\n",
+    "# constants for data handling\n",
    "save_traj = False\n",
    "save_data = False\n",
    "output_dir = \"/content/\""
@@ -2,7 +2,7 @@

 Please note that this project is not actively maintained. However, you can open an issue and tag @gzguevara.

-[DreamBooth](https://huggingface.co/papers/2208.12242) is a method to personalize text2image models like stable diffusion given just a few(3~5) images of a subject. This project consists of **two parts**. Training Stable Diffusion for inpainting requires prompt-image-mask pairs. The Unet of inpainiting models have 5 additional input channels (4 for the encoded masked-image and 1 for the mask itself).
+[DreamBooth](https://huggingface.co/papers/2208.12242) is a method to personalize text2image models like stable diffusion given just a few(3~5) images of a subject. This project consists of **two parts**. Training Stable Diffusion for inpainting requires prompt-image-mask pairs. The Unet of inpainting models have 5 additional input channels (4 for the encoded masked-image and 1 for the mask itself).

 **The first part**, the `multi_inpaint_dataset.ipynb` notebook, demonstrates how make a 🤗 dataset of prompt-image-mask pairs. You can, however, skip the first part and move straight to the second part with the example datasets in this project. ([cat toy dataset masked](https://huggingface.co/datasets/gzguevara/cat_toy_masked), [mr. potato head dataset masked](https://huggingface.co/datasets/gzguevara/mr_potato_head_masked))

@@ -263,12 +263,6 @@ class PromptDiffusionPipeline(
        compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
        processing larger images.
        """
-        depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
-        deprecate(
-            "enable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.enable_tiling()

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
@@ -277,12 +271,6 @@ class PromptDiffusionPipeline(
        Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
        computing decoding in one step.
        """
-        depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
-        deprecate(
-            "disable_vae_tiling",
-            "0.40.0",
-            depr_message,
-        )
        self.vae.disable_tiling()

    # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
@@ -1,91 +0,0 @@
-import logging
-import os
-from dataclasses import dataclass, field
-from typing import List
-
-import torch
-from pydantic import BaseModel
-
-from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3 import StableDiffusion3Pipeline
-
-
-logger = logging.getLogger(__name__)
-
-
-class TextToImageInput(BaseModel):
-    model: str
-    prompt: str
-    size: str | None = None
-    n: int | None = None
-
-
-@dataclass
-class PresetModels:
-    SD3: List[str] = field(default_factory=lambda: ["stabilityai/stable-diffusion-3-medium"])
-    SD3_5: List[str] = field(
-        default_factory=lambda: [
-            "stabilityai/stable-diffusion-3.5-large",
-            "stabilityai/stable-diffusion-3.5-large-turbo",
-            "stabilityai/stable-diffusion-3.5-medium",
-        ]
-    )
-
-
-class TextToImagePipelineSD3:
-    def __init__(self, model_path: str | None = None):
-        self.model_path = model_path or os.getenv("MODEL_PATH")
-        self.pipeline: StableDiffusion3Pipeline | None = None
-        self.device: str | None = None
-
-    def start(self):
-        if torch.cuda.is_available():
-            model_path = self.model_path or "stabilityai/stable-diffusion-3.5-large"
-            logger.info("Loading CUDA")
-            self.device = "cuda"
-            self.pipeline = StableDiffusion3Pipeline.from_pretrained(
-                model_path,
-                torch_dtype=torch.float16,
-            ).to(device=self.device)
-        elif torch.backends.mps.is_available():
-            model_path = self.model_path or "stabilityai/stable-diffusion-3.5-medium"
-            logger.info("Loading MPS for Mac M Series")
-            self.device = "mps"
-            self.pipeline = StableDiffusion3Pipeline.from_pretrained(
-                model_path,
-                torch_dtype=torch.bfloat16,
-            ).to(device=self.device)
-        else:
-            raise Exception("No CUDA or MPS device available")
-
-
-class ModelPipelineInitializer:
-    def __init__(self, model: str = "", type_models: str = "t2im"):
-        self.model = model
-        self.type_models = type_models
-        self.pipeline = None
-        self.device = "cuda" if torch.cuda.is_available() else "mps"
-        self.model_type = None
-
-    def initialize_pipeline(self):
-        if not self.model:
-            raise ValueError("Model name not provided")
-
-        # Check if model exists in PresetModels
-        preset_models = PresetModels()
-
-        # Determine which model type we're dealing with
-        if self.model in preset_models.SD3:
-            self.model_type = "SD3"
-        elif self.model in preset_models.SD3_5:
-            self.model_type = "SD3_5"
-
-        # Create appropriate pipeline based on model type and type_models
-        if self.type_models == "t2im":
-            if self.model_type in ["SD3", "SD3_5"]:
-                self.pipeline = TextToImagePipelineSD3(self.model)
-            else:
-                raise ValueError(f"Model type {self.model_type} not supported for text-to-image")
-        elif self.type_models == "t2v":
-            raise ValueError(f"Unsupported type_models: {self.type_models}")
-
-        return self.pipeline
@@ -1,171 +0,0 @@
-# Asynchronous server and parallel execution of models
-
-> Example/demo server that keeps a single model in memory while safely running parallel inference requests by creating per-request lightweight views and cloning only small, stateful components (schedulers, RNG state, small mutable attrs). Works with StableDiffusion3 pipelines.
-> We recommend running 10 to 50 inferences in parallel for optimal performance, averaging between 25 and 30 seconds to 1 minute and 1 minute and 30 seconds. (This is only recommended if you have a GPU with 35GB of VRAM or more; otherwise, keep it to one or two inferences in parallel to avoid decoding or saving errors due to memory shortages.)
-
-## ⚠️ IMPORTANT
-
-* The example demonstrates how to run pipelines like `StableDiffusion3-3.5` concurrently while keeping a single copy of the heavy model parameters on GPU.
-
-## Necessary components
-
-All the components needed to create the inference server are in the current directory:
-
-```
-server-async/
-├── utils/
-├─────── __init__.py
-├─────── scheduler.py              # BaseAsyncScheduler wrapper and async_retrieve_timesteps for secure inferences
-├─────── requestscopedpipeline.py  # RequestScoped Pipeline for inference with a single in-memory model
-├─────── utils.py                  # Image/video saving utilities and service configuration
-├── Pipelines.py                   # pipeline loader classes (SD3)
-├── serverasync.py                 # FastAPI app with lifespan management and async inference endpoints
-├── test.py                        # Client test script for inference requests
-├── requirements.txt               # Dependencies
-└── README.md                      # This documentation
-```
-
-## What `diffusers-async` adds / Why we needed it
-
-Core problem: a naive server that calls `pipe.__call__` concurrently can hit **race conditions** (e.g., `scheduler.set_timesteps` mutates shared state) or explode memory by deep-copying the whole pipeline per-request.
-
-`diffusers-async` / this example addresses that by:
-
-* **Request-scoped views**: `RequestScopedPipeline` creates a shallow copy of the pipeline per request so heavy weights (UNet, VAE, text encoder) remain shared and *are not duplicated*.
-* **Per-request mutable state**: stateful small objects (scheduler, RNG state, small lists/dicts, callbacks) are cloned per request. The system uses `BaseAsyncScheduler.clone_for_request(...)` for scheduler cloning, with fallback to safe `deepcopy` or other heuristics.
-* **Tokenizer concurrency safety**: `RequestScopedPipeline` now manages an internal tokenizer lock with automatic tokenizer detection and wrapping. This ensures that Rust tokenizers are safe to use under concurrency — race condition errors like `Already borrowed` no longer occur.
-* **`async_retrieve_timesteps(..., return_scheduler=True)`**: fully retro-compatible helper that returns `(timesteps, num_inference_steps, scheduler)` without mutating the shared scheduler. For users not using `return_scheduler=True`, the behavior is identical to the original API.
-* **Robust attribute handling**: wrapper avoids writing to read-only properties (e.g., `components`) and auto-detects small mutable attributes to clone while avoiding duplication of large tensors. Configurable tensor size threshold prevents cloning of large tensors.
-* **Enhanced scheduler wrapping**: `BaseAsyncScheduler` automatically wraps schedulers with improved `__getattr__`, `__setattr__`, and debugging methods (`__repr__`, `__str__`).
-
-## How the server works (high-level flow)
-
-1. **Single model instance** is loaded into memory (GPU/MPS) when the server starts.
-2. On each HTTP inference request:
-
-   * The server uses `RequestScopedPipeline.generate(...)` which:
-
-     * automatically wraps the base scheduler in `BaseAsyncScheduler` (if not already wrapped),
-     * obtains a *local scheduler* (via `clone_for_request()` or `deepcopy`),
-     * does `local_pipe = copy.copy(base_pipe)` (shallow copy),
-     * sets `local_pipe.scheduler = local_scheduler` (if possible),
-     * clones only small mutable attributes (callbacks, rng, small latents) with auto-detection,
-     * wraps tokenizers with thread-safe locks to prevent race conditions,
-     * optionally enters a `model_cpu_offload_context()` for memory offload hooks,
-     * calls the pipeline on the local view (`local_pipe(...)`).
-3. **Result**: inference completes, images are moved to CPU & saved (if requested), internal buffers freed (GC + `torch.cuda.empty_cache()`).
-4. Multiple requests can run in parallel while sharing heavy weights and isolating mutable state.
-
-## How to set up and run the server
-
-### 1) Install dependencies
-
-Recommended: create a virtualenv / conda environment.
-
-```bash
-pip install diffusers
-pip install -r requirements.txt
-```
-
-### 2) Start the server
-
-Using the `serverasync.py` file that already has everything you need:
-
-```bash
-python serverasync.py
-```
-
-The server will start on `http://localhost:8500` by default with the following features:
- FastAPI application with async lifespan management
- Automatic model loading and pipeline initialization
- Request counting and active inference tracking
- Memory cleanup after each inference
- CORS middleware for cross-origin requests
-
-### 3) Test the server
-
-Use the included test script:
-
-```bash
-python test.py
-```
-
-Or send a manual request:
-
-`POST /api/diffusers/inference` with JSON body:
-
-```json
-{
-  "prompt": "A futuristic cityscape, vibrant colors",
-  "num_inference_steps": 30,
-  "num_images_per_prompt": 1
-}
-```
-
-Response example:
-
-```json
-{
-  "response": ["http://localhost:8500/images/img123.png"]
-}
-```
-
-### 4) Server endpoints
-
- `GET /` - Welcome message
- `POST /api/diffusers/inference` - Main inference endpoint
- `GET /images/{filename}` - Serve generated images
- `GET /api/status` - Server status and memory info
-
-## Advanced Configuration
-
-### RequestScopedPipeline Parameters
-
-```python
-RequestScopedPipeline(
-    pipeline,                        # Base pipeline to wrap
-    mutable_attrs=None,             # Custom list of attributes to clone
-    auto_detect_mutables=True,      # Enable automatic detection of mutable attributes
-    tensor_numel_threshold=1_000_000, # Tensor size threshold for cloning
-    tokenizer_lock=None,            # Custom threading lock for tokenizers
-    wrap_scheduler=True             # Auto-wrap scheduler in BaseAsyncScheduler
-)
-```
-
-### BaseAsyncScheduler Features
-
-* Transparent proxy to the original scheduler with `__getattr__` and `__setattr__`
-* `clone_for_request()` method for safe per-request scheduler cloning
-* Enhanced debugging with `__repr__` and `__str__` methods
-* Full compatibility with existing scheduler APIs
-
-### Server Configuration
-
-The server configuration can be modified in `serverasync.py` through the `ServerConfigModels` dataclass:
-
-```python
-@dataclass
-class ServerConfigModels:
-    model: str = 'stabilityai/stable-diffusion-3.5-medium'  
-    type_models: str = 't2im'  
-    host: str = '0.0.0.0' 
-    port: int = 8500
-```
-
-## Troubleshooting (quick)
-
-* `Already borrowed` — previously a Rust tokenizer concurrency error.
-  ✅ This is now fixed: `RequestScopedPipeline` automatically detects and wraps tokenizers with thread locks, so race conditions no longer happen.
-
-* `can't set attribute 'components'` — pipeline exposes read-only `components`.
-  ✅ The RequestScopedPipeline now detects read-only properties and skips setting them automatically.
-
-* Scheduler issues:
-  * If the scheduler doesn't implement `clone_for_request` and `deepcopy` fails, we log and fallback — but prefer `async_retrieve_timesteps(..., return_scheduler=True)` to avoid mutating the shared scheduler.
-  ✅ Note: `async_retrieve_timesteps` is fully retro-compatible — if you don't pass `return_scheduler=True`, the behavior is unchanged.
-
-* Memory issues with large tensors:
-  ✅ The system now has configurable `tensor_numel_threshold` to prevent cloning of large tensors while still cloning small mutable ones.
-
-* Automatic tokenizer detection:
-  ✅ The system automatically identifies tokenizer components by checking for tokenizer methods, class names, and attributes, then applies thread-safe wrappers.
@@ -1,10 +0,0 @@
-torch 
-torchvision 
-transformers 
-sentencepiece 
-fastapi 
-uvicorn 
-ftfy
-accelerate
-xformers
-protobuf
@@ -1,230 +0,0 @@
-import asyncio
-import gc
-import logging
-import os
-import random
-import threading
-from contextlib import asynccontextmanager
-from dataclasses import dataclass
-from typing import Any, Dict, Optional, Type
-
-import torch
-from fastapi import FastAPI, HTTPException, Request
-from fastapi.concurrency import run_in_threadpool
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import FileResponse
-from Pipelines import ModelPipelineInitializer
-from pydantic import BaseModel
-
-from utils import RequestScopedPipeline, Utils
-
-
-@dataclass
-class ServerConfigModels:
-    """Static settings for the demo server."""
-
-    # Model identifier handed to ModelPipelineInitializer.
-    model: str = "stabilityai/stable-diffusion-3.5-medium"
-    # Task selector handed to ModelPipelineInitializer ("t2im" = text-to-image).
-    type_models: str = "t2im"
-    # NOTE(review): the next four fields are not read anywhere in this file.
-    constructor_pipeline: Optional[Type] = None
-    custom_pipeline: Optional[Type] = None
-    components: Optional[Dict[str, Any]] = None
-    torch_dtype: Optional[torch.dtype] = None
-    # Interface and port passed to uvicorn and to the Utils helper.
-    host: str = "0.0.0.0"
-    port: int = 8500
-
-
-server_config = ServerConfigModels()
-
-
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    logging.basicConfig(level=logging.INFO)
-    app.state.logger = logging.getLogger("diffusers-server")
-    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128,expandable_segments:True"
-    os.environ["CUDA_LAUNCH_BLOCKING"] = "0"
-
-    app.state.total_requests = 0
-    app.state.active_inferences = 0
-    app.state.metrics_lock = asyncio.Lock()
-    app.state.metrics_task = None
-
-    app.state.utils_app = Utils(
-        host=server_config.host,
-        port=server_config.port,
-    )
-
-    async def metrics_loop():
-        try:
-            while True:
-                async with app.state.metrics_lock:
-                    total = app.state.total_requests
-                    active = app.state.active_inferences
-                app.state.logger.info(f"[METRICS] total_requests={total} active_inferences={active}")
-                await asyncio.sleep(5)
-        except asyncio.CancelledError:
-            app.state.logger.info("Metrics loop cancelled")
-            raise
-
-    app.state.metrics_task = asyncio.create_task(metrics_loop())
-
-    try:
-        yield
-    finally:
-        task = app.state.metrics_task
-        if task:
-            task.cancel()
-            try:
-                await task
-            except asyncio.CancelledError:
-                pass
-
-        try:
-            stop_fn = getattr(model_pipeline, "stop", None) or getattr(model_pipeline, "close", None)
-            if callable(stop_fn):
-                await run_in_threadpool(stop_fn)
-        except Exception as e:
-            app.state.logger.warning(f"Error during pipeline shutdown: {e}")
-
-        app.state.logger.info("Lifespan shutdown complete")
-
-
-# Application object; startup/shutdown work is delegated to lifespan().
-app = FastAPI(lifespan=lifespan)
-
-logger = logging.getLogger("DiffusersServer.Pipelines")
-
-
-# The model is built and started here, at import time, before the lifespan handler runs.
-initializer = ModelPipelineInitializer(
-    model=server_config.model,
-    type_models=server_config.type_models,
-)
-model_pipeline = initializer.initialize_pipeline()
-model_pipeline.start()
-
-# Request-scoped facade over the single loaded pipeline.
-request_pipe = RequestScopedPipeline(model_pipeline.pipeline)
-# NOTE(review): this lock is stored on app.state below but never acquired in this file.
-pipeline_lock = threading.Lock()
-
-logger.info(f"Pipeline initialized and ready to receive requests (model ={server_config.model})")
-
-# Expose the shared objects to the request handlers through app.state.
-app.state.MODEL_INITIALIZER = initializer
-app.state.MODEL_PIPELINE = model_pipeline
-app.state.REQUEST_PIPE = request_pipe
-app.state.PIPELINE_LOCK = pipeline_lock
-
-
-class JSONBodyQueryAPI(BaseModel):
-    """JSON body accepted by ``POST /api/diffusers/inference``."""
-
-    # NOTE(review): accepted in the body but not read by the inference handler in this file.
-    model: str | None = None
-    # Text prompt; the handler rejects blank prompts with HTTP 400.
-    prompt: str
-    # Optional negative prompt; the handler substitutes "" when it is missing.
-    negative_prompt: str | None = None
-    num_inference_steps: int = 28
-    num_images_per_prompt: int = 1
-
-
-@app.middleware("http")
-async def count_requests_middleware(request: Request, call_next):
-    async with app.state.metrics_lock:
-        app.state.total_requests += 1
-    response = await call_next(request)
-    return response
-
-
-@app.get("/")
-async def root():
-    return {"message": "Welcome to the Diffusers Server"}
-
-
-@app.post("/api/diffusers/inference")
-async def api(json: JSONBodyQueryAPI):
-    prompt = json.prompt
-    negative_prompt = json.negative_prompt or ""
-    num_steps = json.num_inference_steps
-    num_images_per_prompt = json.num_images_per_prompt
-
-    wrapper = app.state.MODEL_PIPELINE
-    initializer = app.state.MODEL_INITIALIZER
-
-    utils_app = app.state.utils_app
-
-    if not wrapper or not wrapper.pipeline:
-        raise HTTPException(500, "Model not initialized correctly")
-    if not prompt.strip():
-        raise HTTPException(400, "No prompt provided")
-
-    def make_generator():
-        g = torch.Generator(device=initializer.device)
-        return g.manual_seed(random.randint(0, 10_000_000))
-
-    req_pipe = app.state.REQUEST_PIPE
-
-    def infer():
-        gen = make_generator()
-        return req_pipe.generate(
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            generator=gen,
-            num_inference_steps=num_steps,
-            num_images_per_prompt=num_images_per_prompt,
-            device=initializer.device,
-            output_type="pil",
-        )
-
-    try:
-        async with app.state.metrics_lock:
-            app.state.active_inferences += 1
-
-        output = await run_in_threadpool(infer)
-
-        async with app.state.metrics_lock:
-            app.state.active_inferences = max(0, app.state.active_inferences - 1)
-
-        urls = [utils_app.save_image(img) for img in output.images]
-        return {"response": urls}
-
-    except Exception as e:
-        async with app.state.metrics_lock:
-            app.state.active_inferences = max(0, app.state.active_inferences - 1)
-        logger.error(f"Error during inference: {e}")
-        raise HTTPException(500, f"Error in processing: {e}")
-
-    finally:
-        if torch.cuda.is_available():
-            torch.cuda.synchronize()
-            torch.cuda.empty_cache()
-            torch.cuda.reset_peak_memory_stats()
-            torch.cuda.ipc_collect()
-        gc.collect()
-
-
-@app.get("/images/{filename}")
-async def serve_image(filename: str):
-    utils_app = app.state.utils_app
-    file_path = os.path.join(utils_app.image_dir, filename)
-    if not os.path.isfile(file_path):
-        raise HTTPException(status_code=404, detail="Image not found")
-    return FileResponse(file_path, media_type="image/png")
-
-
-@app.get("/api/status")
-async def get_status():
-    memory_info = {}
-    if torch.cuda.is_available():
-        memory_allocated = torch.cuda.memory_allocated() / 1024**3  # GB
-        memory_reserved = torch.cuda.memory_reserved() / 1024**3  # GB
-        memory_info = {
-            "memory_allocated_gb": round(memory_allocated, 2),
-            "memory_reserved_gb": round(memory_reserved, 2),
-            "device": torch.cuda.get_device_name(0),
-        }
-
-    return {"current_model": server_config.model, "type_models": server_config.type_models, "memory": memory_info}
-
-
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-if __name__ == "__main__":
-    import uvicorn
-
-    uvicorn.run(app, host=server_config.host, port=server_config.port)
@@ -1,65 +0,0 @@
-import os
-import time
-import urllib.parse
-
-import requests
-
-
-# Inference endpoint of the locally running demo server.
-SERVER_URL = "http://localhost:8500/api/diffusers/inference"
-# Prefix used to resolve relative image URLs returned by the server.
-BASE_URL = "http://localhost:8500"
-# Local directory that receives the downloaded images.
-DOWNLOAD_FOLDER = "generated_images"
-WAIT_BEFORE_DOWNLOAD = 2  # seconds
-
-# Created at import time so downloads never fail on a missing directory.
-os.makedirs(DOWNLOAD_FOLDER, exist_ok=True)
-
-
-def save_from_url(url: str) -> str:
-    """Download the given URL (relative or absolute) and save it locally."""
-    if url.startswith("/"):
-        direct = BASE_URL.rstrip("/") + url
-    else:
-        direct = url
-    resp = requests.get(direct, timeout=60)
-    resp.raise_for_status()
-    filename = os.path.basename(urllib.parse.urlparse(direct).path) or f"img_{int(time.time())}.png"
-    path = os.path.join(DOWNLOAD_FOLDER, filename)
-    with open(path, "wb") as f:
-        f.write(resp.content)
-    return path
-
-
-def main():
-    payload = {
-        "prompt": "The T-800 Terminator Robot Returning From The Future, Anime Style",
-        "num_inference_steps": 30,
-        "num_images_per_prompt": 1,
-    }
-
-    print("Sending request...")
-    try:
-        r = requests.post(SERVER_URL, json=payload, timeout=480)
-        r.raise_for_status()
-    except Exception as e:
-        print(f"Request failed: {e}")
-        return
-
-    body = r.json().get("response", [])
-    # Normalize to a list
-    urls = body if isinstance(body, list) else [body] if body else []
-    if not urls:
-        print("No URLs found in the response. Check the server output.")
-        return
-
-    print(f"Received {len(urls)} URL(s). Waiting {WAIT_BEFORE_DOWNLOAD}s before downloading...")
-    time.sleep(WAIT_BEFORE_DOWNLOAD)
-
-    for u in urls:
-        try:
-            path = save_from_url(u)
-            print(f"Image saved to: {path}")
-        except Exception as e:
-            print(f"Error downloading {u}: {e}")
-
-
-if __name__ == "__main__":
-    main()
@@ -1,2 +0,0 @@
-from .requestscopedpipeline import RequestScopedPipeline
-from .utils import Utils
@@ -1,296 +0,0 @@
-import copy
-import threading
-from typing import Any, Iterable, List, Optional
-
-import torch
-
-from diffusers.utils import logging
-
-from .scheduler import BaseAsyncScheduler, async_retrieve_timesteps
-
-
-logger = logging.get_logger(__name__)
-
-
-def safe_tokenize(tokenizer, *args, lock, **kwargs):
-    with lock:
-        return tokenizer(*args, **kwargs)
-
-
-class RequestScopedPipeline:
-    DEFAULT_MUTABLE_ATTRS = [
-        "_all_hooks",
-        "_offload_device",
-        "_progress_bar_config",
-        "_progress_bar",
-        "_rng_state",
-        "_last_seed",
-        "latents",
-    ]
-
-    def __init__(
-        self,
-        pipeline: Any,
-        mutable_attrs: Optional[Iterable[str]] = None,
-        auto_detect_mutables: bool = True,
-        tensor_numel_threshold: int = 1_000_000,
-        tokenizer_lock: Optional[threading.Lock] = None,
-        wrap_scheduler: bool = True,
-    ):
-        self._base = pipeline
-        self.unet = getattr(pipeline, "unet", None)
-        self.vae = getattr(pipeline, "vae", None)
-        self.text_encoder = getattr(pipeline, "text_encoder", None)
-        self.components = getattr(pipeline, "components", None)
-
-        if wrap_scheduler and hasattr(pipeline, "scheduler") and pipeline.scheduler is not None:
-            if not isinstance(pipeline.scheduler, BaseAsyncScheduler):
-                pipeline.scheduler = BaseAsyncScheduler(pipeline.scheduler)
-
-        self._mutable_attrs = list(mutable_attrs) if mutable_attrs is not None else list(self.DEFAULT_MUTABLE_ATTRS)
-        self._tokenizer_lock = tokenizer_lock if tokenizer_lock is not None else threading.Lock()
-
-        self._auto_detect_mutables = bool(auto_detect_mutables)
-        self._tensor_numel_threshold = int(tensor_numel_threshold)
-
-        self._auto_detected_attrs: List[str] = []
-
-    def _make_local_scheduler(self, num_inference_steps: int, device: Optional[str] = None, **clone_kwargs):
-        base_sched = getattr(self._base, "scheduler", None)
-        if base_sched is None:
-            return None
-
-        if not isinstance(base_sched, BaseAsyncScheduler):
-            wrapped_scheduler = BaseAsyncScheduler(base_sched)
-        else:
-            wrapped_scheduler = base_sched
-
-        try:
-            return wrapped_scheduler.clone_for_request(
-                num_inference_steps=num_inference_steps, device=device, **clone_kwargs
-            )
-        except Exception as e:
-            logger.debug(f"clone_for_request failed: {e}; falling back to deepcopy()")
-            try:
-                return copy.deepcopy(wrapped_scheduler)
-            except Exception as e:
-                logger.warning(f"Deepcopy of scheduler failed: {e}. Returning original scheduler (*risky*).")
-                return wrapped_scheduler
-
-    def _autodetect_mutables(self, max_attrs: int = 40):
-        if not self._auto_detect_mutables:
-            return []
-
-        if self._auto_detected_attrs:
-            return self._auto_detected_attrs
-
-        candidates: List[str] = []
-        seen = set()
-        for name in dir(self._base):
-            if name.startswith("__"):
-                continue
-            if name in self._mutable_attrs:
-                continue
-            if name in ("to", "save_pretrained", "from_pretrained"):
-                continue
-            try:
-                val = getattr(self._base, name)
-            except Exception:
-                continue
-
-            import types
-
-            # skip callables and modules
-            if callable(val) or isinstance(val, (types.ModuleType, types.FunctionType, types.MethodType)):
-                continue
-
-            # containers -> candidate
-            if isinstance(val, (dict, list, set, tuple, bytearray)):
-                candidates.append(name)
-                seen.add(name)
-            else:
-                # try Tensor detection
-                try:
-                    if isinstance(val, torch.Tensor):
-                        if val.numel() <= self._tensor_numel_threshold:
-                            candidates.append(name)
-                            seen.add(name)
-                        else:
-                            logger.debug(f"Ignoring large tensor attr '{name}', numel={val.numel()}")
-                except Exception:
-                    continue
-
-            if len(candidates) >= max_attrs:
-                break
-
-        self._auto_detected_attrs = candidates
-        logger.debug(f"Autodetected mutable attrs to clone: {self._auto_detected_attrs}")
-        return self._auto_detected_attrs
-
-    def _is_readonly_property(self, base_obj, attr_name: str) -> bool:
-        try:
-            cls = type(base_obj)
-            descriptor = getattr(cls, attr_name, None)
-            if isinstance(descriptor, property):
-                return descriptor.fset is None
-            if hasattr(descriptor, "__set__") is False and descriptor is not None:
-                return False
-        except Exception:
-            pass
-        return False
-
-    def _clone_mutable_attrs(self, base, local):
-        attrs_to_clone = list(self._mutable_attrs)
-        attrs_to_clone.extend(self._autodetect_mutables())
-
-        EXCLUDE_ATTRS = {
-            "components",
-        }
-
-        for attr in attrs_to_clone:
-            if attr in EXCLUDE_ATTRS:
-                logger.debug(f"Skipping excluded attr '{attr}'")
-                continue
-            if not hasattr(base, attr):
-                continue
-            if self._is_readonly_property(base, attr):
-                logger.debug(f"Skipping read-only property '{attr}'")
-                continue
-
-            try:
-                val = getattr(base, attr)
-            except Exception as e:
-                logger.debug(f"Could not getattr('{attr}') on base pipeline: {e}")
-                continue
-
-            try:
-                if isinstance(val, dict):
-                    setattr(local, attr, dict(val))
-                elif isinstance(val, (list, tuple, set)):
-                    setattr(local, attr, list(val))
-                elif isinstance(val, bytearray):
-                    setattr(local, attr, bytearray(val))
-                else:
-                    # small tensors or atomic values
-                    if isinstance(val, torch.Tensor):
-                        if val.numel() <= self._tensor_numel_threshold:
-                            setattr(local, attr, val.clone())
-                        else:
-                            # don't clone big tensors, keep reference
-                            setattr(local, attr, val)
-                    else:
-                        try:
-                            setattr(local, attr, copy.copy(val))
-                        except Exception:
-                            setattr(local, attr, val)
-            except (AttributeError, TypeError) as e:
-                logger.debug(f"Skipping cloning attribute '{attr}' because it is not settable: {e}")
-                continue
-            except Exception as e:
-                logger.debug(f"Unexpected error cloning attribute '{attr}': {e}")
-                continue
-
-    def _is_tokenizer_component(self, component) -> bool:
-        if component is None:
-            return False
-
-        tokenizer_methods = ["encode", "decode", "tokenize", "__call__"]
-        has_tokenizer_methods = any(hasattr(component, method) for method in tokenizer_methods)
-
-        class_name = component.__class__.__name__.lower()
-        has_tokenizer_in_name = "tokenizer" in class_name
-
-        tokenizer_attrs = ["vocab_size", "pad_token", "eos_token", "bos_token"]
-        has_tokenizer_attrs = any(hasattr(component, attr) for attr in tokenizer_attrs)
-
-        return has_tokenizer_methods and (has_tokenizer_in_name or has_tokenizer_attrs)
-
-    def generate(self, *args, num_inference_steps: int = 50, device: Optional[str] = None, **kwargs):
-        local_scheduler = self._make_local_scheduler(num_inference_steps=num_inference_steps, device=device)
-
-        try:
-            local_pipe = copy.copy(self._base)
-        except Exception as e:
-            logger.warning(f"copy.copy(self._base) failed: {e}. Falling back to deepcopy (may increase memory).")
-            local_pipe = copy.deepcopy(self._base)
-
-        if local_scheduler is not None:
-            try:
-                timesteps, num_steps, configured_scheduler = async_retrieve_timesteps(
-                    local_scheduler.scheduler,
-                    num_inference_steps=num_inference_steps,
-                    device=device,
-                    return_scheduler=True,
-                    **{k: v for k, v in kwargs.items() if k in ["timesteps", "sigmas"]},
-                )
-
-                final_scheduler = BaseAsyncScheduler(configured_scheduler)
-                setattr(local_pipe, "scheduler", final_scheduler)
-            except Exception:
-                logger.warning("Could not set scheduler on local pipe; proceeding without replacing scheduler.")
-
-        self._clone_mutable_attrs(self._base, local_pipe)
-
-        # 4) wrap tokenizers on the local pipe with the lock wrapper
-        tokenizer_wrappers = {}  # name -> original_tokenizer
-        try:
-            # a) wrap direct tokenizer attributes (tokenizer, tokenizer_2, ...)
-            for name in dir(local_pipe):
-                if "tokenizer" in name and not name.startswith("_"):
-                    tok = getattr(local_pipe, name, None)
-                    if tok is not None and self._is_tokenizer_component(tok):
-                        tokenizer_wrappers[name] = tok
-                        setattr(
-                            local_pipe,
-                            name,
-                            lambda *args, tok=tok, **kwargs: safe_tokenize(
-                                tok, *args, lock=self._tokenizer_lock, **kwargs
-                            ),
-                        )
-
-            # b) wrap tokenizers in components dict
-            if hasattr(local_pipe, "components") and isinstance(local_pipe.components, dict):
-                for key, val in local_pipe.components.items():
-                    if val is None:
-                        continue
-
-                    if self._is_tokenizer_component(val):
-                        tokenizer_wrappers[f"components[{key}]"] = val
-                        local_pipe.components[key] = lambda *args, tokenizer=val, **kwargs: safe_tokenize(
-                            tokenizer, *args, lock=self._tokenizer_lock, **kwargs
-                        )
-
-        except Exception as e:
-            logger.debug(f"Tokenizer wrapping step encountered an error: {e}")
-
-        result = None
-        cm = getattr(local_pipe, "model_cpu_offload_context", None)
-        try:
-            if callable(cm):
-                try:
-                    with cm():
-                        result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs)
-                except TypeError:
-                    # cm might be a context manager instance rather than callable
-                    try:
-                        with cm:
-                            result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs)
-                    except Exception as e:
-                        logger.debug(f"model_cpu_offload_context usage failed: {e}. Proceeding without it.")
-                        result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs)
-            else:
-                # no offload context available — call directly
-                result = local_pipe(*args, num_inference_steps=num_inference_steps, **kwargs)
-
-            return result
-
-        finally:
-            try:
-                for name, tok in tokenizer_wrappers.items():
-                    if name.startswith("components["):
-                        key = name[len("components[") : -1]
-                        local_pipe.components[key] = tok
-                    else:
-                        setattr(local_pipe, name, tok)
-            except Exception as e:
-                logger.debug(f"Error restoring wrapped tokenizers: {e}")
@@ -1,141 +0,0 @@
-import copy
-import inspect
-from typing import Any, List, Optional, Union
-
-import torch
-
-
-class BaseAsyncScheduler:
-    def __init__(self, scheduler: Any):
-        self.scheduler = scheduler
-
-    def __getattr__(self, name: str):
-        if hasattr(self.scheduler, name):
-            return getattr(self.scheduler, name)
-        raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
-
-    def __setattr__(self, name: str, value):
-        if name == "scheduler":
-            super().__setattr__(name, value)
-        else:
-            if hasattr(self, "scheduler") and hasattr(self.scheduler, name):
-                setattr(self.scheduler, name, value)
-            else:
-                super().__setattr__(name, value)
-
-    def clone_for_request(self, num_inference_steps: int, device: Union[str, torch.device, None] = None, **kwargs):
-        local = copy.deepcopy(self.scheduler)
-        local.set_timesteps(num_inference_steps=num_inference_steps, device=device, **kwargs)
-        cloned = self.__class__(local)
-        return cloned
-
-    def __repr__(self):
-        return f"BaseAsyncScheduler({repr(self.scheduler)})"
-
-    def __str__(self):
-        return f"BaseAsyncScheduler wrapping: {str(self.scheduler)}"
-
-
-def async_retrieve_timesteps(
-    scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    sigmas: Optional[List[float]] = None,
-    **kwargs,
-):
-    r"""
-    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call.
-    Handles custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
-    Backwards compatible: by default the function behaves exactly as before and returns
-        (timesteps_tensor, num_inference_steps)
-
-    If the caller passes `return_scheduler=True` in kwargs, the function will **not** mutate the passed
-    scheduler. Instead it will use a cloned scheduler if available (via `scheduler.clone_for_request`)
-    or a deepcopy fallback, call `set_timesteps` on that cloned scheduler, and return:
-        (timesteps_tensor, num_inference_steps, scheduler_in_use)
-
-    Args:
-        scheduler (`SchedulerMixin`):
-            The scheduler to get timesteps from.
-        num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
-            must be `None`.
-        device (`str` or `torch.device`, *optional*):
-            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        timesteps (`List[int]`, *optional*):
-            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
-            `num_inference_steps` and `sigmas` must be `None`.
-        sigmas (`List[float]`, *optional*):
-            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
-            `num_inference_steps` and `timesteps` must be `None`.
-
-    Optional kwargs:
-        return_scheduler (bool, default False): if True, return (timesteps, num_inference_steps, scheduler_in_use)
-            where `scheduler_in_use` is a scheduler instance that already has timesteps set.
-            This mode will prefer `scheduler.clone_for_request(...)` if available, to avoid mutating the original scheduler.
-
-    Returns:
-        `(timesteps_tensor, num_inference_steps)` by default (backwards compatible), or
-        `(timesteps_tensor, num_inference_steps, scheduler_in_use)` if `return_scheduler=True`.
-    """
-    # pop our optional control kwarg (keeps compatibility)
-    return_scheduler = bool(kwargs.pop("return_scheduler", False))
-
-    if timesteps is not None and sigmas is not None:
-        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
-
-    # choose scheduler to call set_timesteps on
-    scheduler_in_use = scheduler
-    if return_scheduler:
-        # Do not mutate the provided scheduler: prefer to clone if possible
-        if hasattr(scheduler, "clone_for_request"):
-            try:
-                # clone_for_request may accept num_inference_steps or other kwargs; be permissive
-                scheduler_in_use = scheduler.clone_for_request(
-                    num_inference_steps=num_inference_steps or 0, device=device
-                )
-            except Exception:
-                scheduler_in_use = copy.deepcopy(scheduler)
-        else:
-            # fallback deepcopy (scheduler tends to be smallish - acceptable)
-            scheduler_in_use = copy.deepcopy(scheduler)
-
-    # helper to test if set_timesteps supports a particular kwarg
-    def _accepts(param_name: str) -> bool:
-        try:
-            return param_name in set(inspect.signature(scheduler_in_use.set_timesteps).parameters.keys())
-        except (ValueError, TypeError):
-            # if signature introspection fails, be permissive and attempt the call later
-            return False
-
-    # now call set_timesteps on the chosen scheduler_in_use (may be original or clone)
-    if timesteps is not None:
-        accepts_timesteps = _accepts("timesteps")
-        if not accepts_timesteps:
-            raise ValueError(
-                f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom"
-                f" timestep schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler_in_use.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-        timesteps_out = scheduler_in_use.timesteps
-        num_inference_steps = len(timesteps_out)
-    elif sigmas is not None:
-        accept_sigmas = _accepts("sigmas")
-        if not accept_sigmas:
-            raise ValueError(
-                f"The current scheduler class {scheduler_in_use.__class__}'s `set_timesteps` does not support custom"
-                f" sigmas schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler_in_use.set_timesteps(sigmas=sigmas, device=device, **kwargs)
-        timesteps_out = scheduler_in_use.timesteps
-        num_inference_steps = len(timesteps_out)
-    else:
-        # default path
-        scheduler_in_use.set_timesteps(num_inference_steps, device=device, **kwargs)
-        timesteps_out = scheduler_in_use.timesteps
-
-    if return_scheduler:
-        return timesteps_out, num_inference_steps, scheduler_in_use
-    return timesteps_out, num_inference_steps
@@ -1,48 +0,0 @@
-import gc
-import logging
-import os
-import tempfile
-import uuid
-
-import torch
-
-
-logger = logging.getLogger(__name__)
-
-
-class Utils:
-    def __init__(self, host: str = "0.0.0.0", port: int = 8500):
-        self.service_url = f"http://{host}:{port}"
-        self.image_dir = os.path.join(tempfile.gettempdir(), "images")
-        if not os.path.exists(self.image_dir):
-            os.makedirs(self.image_dir)
-
-        self.video_dir = os.path.join(tempfile.gettempdir(), "videos")
-        if not os.path.exists(self.video_dir):
-            os.makedirs(self.video_dir)
-
-    def save_image(self, image):
-        if hasattr(image, "to"):
-            try:
-                image = image.to("cpu")
-            except Exception:
-                pass
-
-        if isinstance(image, torch.Tensor):
-            from torchvision import transforms
-
-            to_pil = transforms.ToPILImage()
-            image = to_pil(image.squeeze(0).clamp(0, 1))
-
-        filename = "img" + str(uuid.uuid4()).split("-")[0] + ".png"
-        image_path = os.path.join(self.image_dir, filename)
-        logger.info(f"Saving image to {image_path}")
-
-        image.save(image_path, format="PNG", optimize=True)
-
-        del image
-        gc.collect()
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-
-        return os.path.join(self.service_url, "images", filename)
@@ -9,8 +9,8 @@ This guide will show you how to use the [`StableDiffusion3Pipeline`] in a server
 Start by navigating to the `examples/server` folder and installing all of the dependencies.

 ```py
-pip install diffusers
-pip install -r requirements.txt
+pip install .
+pip install -r requirements.txt
 ```

 Launch the server with the following command.
@@ -6,5 +6,4 @@ py-consul
 prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 fastapi
-uvicorn
-accelerate
+uvicorn
@@ -39,7 +39,7 @@ fsspec==2024.10.0
    #   torch
 h11==0.14.0
    # via uvicorn
-huggingface-hub==0.35.0
+huggingface-hub==0.26.1
    # via
    #   tokenizers
    #   transformers
@@ -278,29 +278,6 @@ def get_transformer_config(model_type: str) -> Tuple[Dict[str, Any], ...]:
        }
        RENAME_DICT = VACE_TRANSFORMER_KEYS_RENAME_DICT
        SPECIAL_KEYS_REMAP = VACE_TRANSFORMER_SPECIAL_KEYS_REMAP
-    elif model_type == "Wan2.2-VACE-Fun-14B":
-        config = {
-            "model_id": "alibaba-pai/Wan2.2-VACE-Fun-A14B",
-            "diffusers_config": {
-                "added_kv_proj_dim": None,
-                "attention_head_dim": 128,
-                "cross_attn_norm": True,
-                "eps": 1e-06,
-                "ffn_dim": 13824,
-                "freq_dim": 256,
-                "in_channels": 16,
-                "num_attention_heads": 40,
-                "num_layers": 40,
-                "out_channels": 16,
-                "patch_size": [1, 2, 2],
-                "qk_norm": "rms_norm_across_heads",
-                "text_dim": 4096,
-                "vace_layers": [0, 5, 10, 15, 20, 25, 30, 35],
-                "vace_in_channels": 96,
-            },
-        }
-        RENAME_DICT = VACE_TRANSFORMER_KEYS_RENAME_DICT
-        SPECIAL_KEYS_REMAP = VACE_TRANSFORMER_SPECIAL_KEYS_REMAP
    elif model_type == "Wan2.2-I2V-14B-720p":
        config = {
            "model_id": "Wan-AI/Wan2.2-I2V-A14B",
@@ -998,17 +975,7 @@ if __name__ == "__main__":
            image_encoder=image_encoder,
            image_processor=image_processor,
        )
-    elif "Wan2.2-VACE" in args.model_type:
-        pipe = WanVACEPipeline(
-            transformer=transformer,
-            transformer_2=transformer_2,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            vae=vae,
-            scheduler=scheduler,
-            boundary_ratio=0.875,
-        )
-    elif "Wan-VACE" in args.model_type:
+    elif "VACE" in args.model_type:
        pipe = WanVACEPipeline(
            transformer=transformer,
            text_encoder=text_encoder,
@@ -102,8 +102,7 @@ _deps = [
    "filelock",
    "flax>=0.4.1",
    "hf-doc-builder>=0.3.0",
-    "httpx<1.0.0",
-    "huggingface-hub>=0.34.0,<2.0",
+    "huggingface-hub>=0.34.0",
    "requests-mock==1.10.0",
    "importlib_metadata",
    "invisible-watermark>=0.2.0",
@@ -133,7 +132,6 @@ _deps = [
    "gguf>=0.10.0",
    "torchao>=0.7.0",
    "bitsandbytes>=0.43.3",
-    "nvidia_modelopt[hf]>=0.33.1",
    "regex!=2019.12.17",
    "requests",
    "tensorboard",
@@ -246,7 +244,6 @@ extras["bitsandbytes"] = deps_list("bitsandbytes", "accelerate")
 extras["gguf"] = deps_list("gguf", "accelerate")
 extras["optimum_quanto"] = deps_list("optimum_quanto", "accelerate")
 extras["torchao"] = deps_list("torchao", "accelerate")
-extras["nvidia_modelopt"] = deps_list("nvidia_modelopt[hf]")

 if os.name == "nt":  # windows
    extras["flax"] = []  # jax is not supported on windows
@@ -260,7 +257,6 @@ extras["dev"] = (
 install_requires = [
    deps["importlib_metadata"],
    deps["filelock"],
-    deps["httpx"],
    deps["huggingface-hub"],
    deps["numpy"],
    deps["regex"],
@@ -13,7 +13,6 @@ from .utils import (
    is_k_diffusion_available,
    is_librosa_available,
    is_note_seq_available,
-    is_nvidia_modelopt_available,
    is_onnx_available,
    is_opencv_available,
    is_optimum_quanto_available,
@@ -112,18 +111,6 @@ except OptionalDependencyNotAvailable:
 else:
    _import_structure["quantizers.quantization_config"].append("QuantoConfig")

-try:
-    if not is_torch_available() and not is_accelerate_available() and not is_nvidia_modelopt_available():
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from .utils import dummy_nvidia_modelopt_objects
-
-    _import_structure["utils.dummy_nvidia_modelopt_objects"] = [
-        name for name in dir(dummy_nvidia_modelopt_objects) if not name.startswith("_")
-    ]
-else:
-    _import_structure["quantizers.quantization_config"].append("NVIDIAModelOptConfig")
-
 try:
    if not is_onnx_available():
        raise OptionalDependencyNotAvailable()
@@ -202,7 +189,6 @@ else:
            "CogView4Transformer2DModel",
            "ConsisIDTransformer3DModel",
            "ConsistencyDecoderVAE",
-            "ContextParallelConfig",
            "ControlNetModel",
            "ControlNetUnionModel",
            "ControlNetXSAdapter",
@@ -230,7 +216,6 @@ else:
            "MultiAdapter",
            "MultiControlNetModel",
            "OmniGenTransformer2DModel",
-            "ParallelConfig",
            "PixArtTransformer2DModel",
            "PriorTransformer",
            "QwenImageControlNetModel",
@@ -387,10 +372,6 @@ else:
        [
            "FluxAutoBlocks",
            "FluxModularPipeline",
-            "QwenImageAutoBlocks",
-            "QwenImageEditAutoBlocks",
-            "QwenImageEditModularPipeline",
-            "QwenImageModularPipeline",
            "StableDiffusionXLAutoBlocks",
            "StableDiffusionXLModularPipeline",
            "WanAutoBlocks",
@@ -497,7 +478,6 @@ else:
            "LTXImageToVideoPipeline",
            "LTXLatentUpsamplePipeline",
            "LTXPipeline",
-            "LucyEditPipeline",
            "Lumina2Pipeline",
            "Lumina2Text2ImgPipeline",
            "LuminaPipeline",
@@ -513,11 +493,8 @@ else:
            "PixArtAlphaPipeline",
            "PixArtSigmaPAGPipeline",
            "PixArtSigmaPipeline",
-            "QwenImageControlNetInpaintPipeline",
            "QwenImageControlNetPipeline",
-            "QwenImageEditInpaintPipeline",
            "QwenImageEditPipeline",
-            "QwenImageEditPlusPipeline",
            "QwenImageImg2ImgPipeline",
            "QwenImageInpaintPipeline",
            "QwenImagePipeline",
@@ -817,14 +794,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    else:
        from .quantizers.quantization_config import QuantoConfig

-    try:
-        if not is_nvidia_modelopt_available():
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        from .utils.dummy_nvidia_modelopt_objects import *
-    else:
-        from .quantizers.quantization_config import NVIDIAModelOptConfig
-
    try:
        if not is_onnx_available():
            raise OptionalDependencyNotAvailable()
@@ -890,7 +859,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            CogView4Transformer2DModel,
            ConsisIDTransformer3DModel,
            ConsistencyDecoderVAE,
-            ContextParallelConfig,
            ControlNetModel,
            ControlNetUnionModel,
            ControlNetXSAdapter,
@@ -918,7 +886,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            MultiAdapter,
            MultiControlNetModel,
            OmniGenTransformer2DModel,
-            ParallelConfig,
            PixArtTransformer2DModel,
            PriorTransformer,
            QwenImageControlNetModel,
@@ -1049,10 +1016,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .modular_pipelines import (
            FluxAutoBlocks,
            FluxModularPipeline,
-            QwenImageAutoBlocks,
-            QwenImageEditAutoBlocks,
-            QwenImageEditModularPipeline,
-            QwenImageModularPipeline,
            StableDiffusionXLAutoBlocks,
            StableDiffusionXLModularPipeline,
            WanAutoBlocks,
@@ -1155,7 +1118,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            LTXImageToVideoPipeline,
            LTXLatentUpsamplePipeline,
            LTXPipeline,
-            LucyEditPipeline,
            Lumina2Pipeline,
            Lumina2Text2ImgPipeline,
            LuminaPipeline,
@@ -1171,11 +1133,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            PixArtAlphaPipeline,
            PixArtSigmaPAGPipeline,
            PixArtSigmaPipeline,
-            QwenImageControlNetInpaintPipeline,
            QwenImageControlNetPipeline,
-            QwenImageEditInpaintPipeline,
            QwenImageEditPipeline,
-            QwenImageEditPlusPipeline,
            QwenImageImg2ImgPipeline,
            QwenImageInpaintPipeline,
            QwenImagePipeline,
@@ -30,11 +30,11 @@ import numpy as np
 from huggingface_hub import DDUFEntry, create_repo, hf_hub_download
 from huggingface_hub.utils import (
    EntryNotFoundError,
-    HfHubHTTPError,
    RepositoryNotFoundError,
    RevisionNotFoundError,
    validate_hf_hub_args,
 )
+from requests import HTTPError
 from typing_extensions import Self

 from . import __version__
@@ -419,7 +419,7 @@ class ConfigMixin:
                raise EnvironmentError(
                    f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}."
                )
-            except HfHubHTTPError as err:
+            except HTTPError as err:
                raise EnvironmentError(
                    "There was a specific connection error when trying to load"
                    f" {pretrained_model_name_or_path}:\n{err}"
@@ -9,8 +9,7 @@ deps = {
    "filelock": "filelock",
    "flax": "flax>=0.4.1",
    "hf-doc-builder": "hf-doc-builder>=0.3.0",
-    "httpx": "httpx<1.0.0",
-    "huggingface-hub": "huggingface-hub>=0.34.0,<2.0",
+    "huggingface-hub": "huggingface-hub>=0.34.0",
    "requests-mock": "requests-mock==1.10.0",
    "importlib_metadata": "importlib_metadata",
    "invisible-watermark": "invisible-watermark>=0.2.0",
@@ -40,7 +39,6 @@ deps = {
    "gguf": "gguf>=0.10.0",
    "torchao": "torchao>=0.7.0",
    "bitsandbytes": "bitsandbytes>=0.43.3",
-    "nvidia_modelopt[hf]": "nvidia_modelopt[hf]>=0.33.1",
    "regex": "regex!=2019.12.17",
    "requests": "requests",
    "tensorboard": "tensorboard",
@@ -82,15 +82,15 @@ class AutoGuidance(BaseGuidance):
        self.guidance_rescale = guidance_rescale
        self.use_original_formulation = use_original_formulation

-        is_layer_or_config_provided = auto_guidance_layers is not None or auto_guidance_config is not None
-        is_layer_and_config_provided = auto_guidance_layers is not None and auto_guidance_config is not None
-        if not is_layer_or_config_provided:
+        if auto_guidance_layers is None and auto_guidance_config is None:
            raise ValueError(
-                "Either `auto_guidance_layers` or `auto_guidance_config` must be provided to enable AutoGuidance."
+                "Either `auto_guidance_layers` or `auto_guidance_config` must be provided to enable Skip Layer Guidance."
            )
-        if is_layer_and_config_provided:
+        if auto_guidance_layers is not None and auto_guidance_config is not None:
            raise ValueError("Only one of `auto_guidance_layers` or `auto_guidance_config` can be provided.")
-        if auto_guidance_config is None and dropout is None:
+        if (dropout is None and auto_guidance_layers is not None) or (
+            dropout is not None and auto_guidance_layers is None
+        ):
            raise ValueError("`dropout` must be provided if `auto_guidance_layers` is provided.")

        if auto_guidance_layers is not None:
@@ -61,7 +61,7 @@ def project(v0: torch.Tensor, v1: torch.Tensor, upcast_to_double: bool = True) -
 def build_image_from_pyramid(pyramid: List[torch.Tensor]) -> torch.Tensor:
    """
    Recovers the data space latents from the Laplacian pyramid frequency space. Implementation from the paper
-    (Algorithm 2).
+    (Algorithm 2).
    """
    # pyramid shapes: [[B, C, H, W], [B, C, H/2, W/2], ...]
    img = pyramid[-1]
@@ -16,7 +16,6 @@ from ..utils import is_torch_available


 if is_torch_available():
-    from .context_parallel import apply_context_parallel
    from .faster_cache import FasterCacheConfig, apply_faster_cache
    from .first_block_cache import FirstBlockCacheConfig, apply_first_block_cache
    from .group_offloading import apply_group_offloading
@@ -108,7 +108,6 @@ def _register_attention_processors_metadata():
    from ..models.attention_processor import AttnProcessor2_0
    from ..models.transformers.transformer_cogview4 import CogView4AttnProcessor
    from ..models.transformers.transformer_flux import FluxAttnProcessor
-    from ..models.transformers.transformer_qwenimage import QwenDoubleStreamAttnProcessor2_0
    from ..models.transformers.transformer_wan import WanAttnProcessor2_0

    # AttnProcessor2_0
@@ -141,14 +140,6 @@ def _register_attention_processors_metadata():
        metadata=AttentionProcessorMetadata(skip_processor_output_fn=_skip_proc_output_fn_Attention_FluxAttnProcessor),
    )

-    # QwenDoubleStreamAttnProcessor2
-    AttentionProcessorRegistry.register(
-        model_class=QwenDoubleStreamAttnProcessor2_0,
-        metadata=AttentionProcessorMetadata(
-            skip_processor_output_fn=_skip_proc_output_fn_Attention_QwenDoubleStreamAttnProcessor2_0
-        ),
-    )
-

 def _register_transformer_blocks_metadata():
    from ..models.attention import BasicTransformerBlock
@@ -307,5 +298,4 @@ _skip_proc_output_fn_Attention_CogView4AttnProcessor = _skip_attention___ret___h
 _skip_proc_output_fn_Attention_WanAttnProcessor2_0 = _skip_attention___ret___hidden_states
 # not sure what this is yet.
 _skip_proc_output_fn_Attention_FluxAttnProcessor = _skip_attention___ret___hidden_states
-_skip_proc_output_fn_Attention_QwenDoubleStreamAttnProcessor2_0 = _skip_attention___ret___hidden_states
 # fmt: on
@@ -1,297 +0,0 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-from dataclasses import dataclass
-from typing import Dict, List, Type, Union
-
-import torch
-import torch.distributed._functional_collectives as funcol
-
-from ..models._modeling_parallel import (
-    ContextParallelConfig,
-    ContextParallelInput,
-    ContextParallelModelPlan,
-    ContextParallelOutput,
-)
-from ..utils import get_logger
-from ..utils.torch_utils import unwrap_module
-from .hooks import HookRegistry, ModelHook
-
-
-logger = get_logger(__name__)  # pylint: disable=invalid-name
-
-_CONTEXT_PARALLEL_INPUT_HOOK_TEMPLATE = "cp_input---{}"
-_CONTEXT_PARALLEL_OUTPUT_HOOK_TEMPLATE = "cp_output---{}"
-
-
-# TODO(aryan): consolidate with ._helpers.TransformerBlockMetadata
-@dataclass
-class ModuleForwardMetadata:
-    cached_parameter_indices: Dict[str, int] = None
-    _cls: Type = None
-
-    def _get_parameter_from_args_kwargs(self, identifier: str, args=(), kwargs=None):
-        kwargs = kwargs or {}
-
-        if identifier in kwargs:
-            return kwargs[identifier], True, None
-
-        if self.cached_parameter_indices is not None:
-            index = self.cached_parameter_indices.get(identifier, None)
-            if index is None:
-                raise ValueError(f"Parameter '{identifier}' not found in cached indices.")
-            return args[index], False, index
-
-        if self._cls is None:
-            raise ValueError("Model class is not set for metadata.")
-
-        parameters = list(inspect.signature(self._cls.forward).parameters.keys())
-        parameters = parameters[1:]  # skip `self`
-        self.cached_parameter_indices = {param: i for i, param in enumerate(parameters)}
-
-        if identifier not in self.cached_parameter_indices:
-            raise ValueError(f"Parameter '{identifier}' not found in function signature but was requested.")
-
-        index = self.cached_parameter_indices[identifier]
-
-        if index >= len(args):
-            raise ValueError(f"Expected {index} arguments but got {len(args)}.")
-
-        return args[index], False, index
-
-
-def apply_context_parallel(
-    module: torch.nn.Module,
-    parallel_config: ContextParallelConfig,
-    plan: Dict[str, ContextParallelModelPlan],
-) -> None:
-    """Apply context parallel on a model."""
-    logger.debug(f"Applying context parallel with CP mesh: {parallel_config._mesh} and plan: {plan}")
-
-    for module_id, cp_model_plan in plan.items():
-        submodule = _get_submodule_by_name(module, module_id)
-        if not isinstance(submodule, list):
-            submodule = [submodule]
-
-        logger.debug(f"Applying ContextParallelHook to {module_id=} identifying a total of {len(submodule)} modules")
-
-        for m in submodule:
-            if isinstance(cp_model_plan, dict):
-                hook = ContextParallelSplitHook(cp_model_plan, parallel_config)
-                hook_name = _CONTEXT_PARALLEL_INPUT_HOOK_TEMPLATE.format(module_id)
-            elif isinstance(cp_model_plan, (ContextParallelOutput, list, tuple)):
-                if isinstance(cp_model_plan, ContextParallelOutput):
-                    cp_model_plan = [cp_model_plan]
-                if not all(isinstance(x, ContextParallelOutput) for x in cp_model_plan):
-                    raise ValueError(f"Expected all elements of cp_model_plan to be CPOutput, but got {cp_model_plan}")
-                hook = ContextParallelGatherHook(cp_model_plan, parallel_config)
-                hook_name = _CONTEXT_PARALLEL_OUTPUT_HOOK_TEMPLATE.format(module_id)
-            else:
-                raise ValueError(f"Unsupported context parallel model plan type: {type(cp_model_plan)}")
-            registry = HookRegistry.check_if_exists_or_initialize(m)
-            registry.register_hook(hook, hook_name)
-
-
-def remove_context_parallel(module: torch.nn.Module, plan: Dict[str, ContextParallelModelPlan]) -> None:
-    for module_id, cp_model_plan in plan.items():
-        submodule = _get_submodule_by_name(module, module_id)
-        if not isinstance(submodule, list):
-            submodule = [submodule]
-
-        for m in submodule:
-            registry = HookRegistry.check_if_exists_or_initialize(m)
-            if isinstance(cp_model_plan, dict):
-                hook_name = _CONTEXT_PARALLEL_INPUT_HOOK_TEMPLATE.format(module_id)
-            elif isinstance(cp_model_plan, (ContextParallelOutput, list, tuple)):
-                hook_name = _CONTEXT_PARALLEL_OUTPUT_HOOK_TEMPLATE.format(module_id)
-            else:
-                raise ValueError(f"Unsupported context parallel model plan type: {type(cp_model_plan)}")
-            registry.remove_hook(hook_name)
-
-
-class ContextParallelSplitHook(ModelHook):
-    def __init__(self, metadata: ContextParallelModelPlan, parallel_config: ContextParallelConfig) -> None:
-        super().__init__()
-        self.metadata = metadata
-        self.parallel_config = parallel_config
-        self.module_forward_metadata = None
-
-    def initialize_hook(self, module):
-        cls = unwrap_module(module).__class__
-        self.module_forward_metadata = ModuleForwardMetadata(_cls=cls)
-        return module
-
-    def pre_forward(self, module, *args, **kwargs):
-        args_list = list(args)
-
-        for name, cpm in self.metadata.items():
-            if isinstance(cpm, ContextParallelInput) and cpm.split_output:
-                continue
-
-            # Maybe the parameter was passed as a keyword argument
-            input_val, is_kwarg, index = self.module_forward_metadata._get_parameter_from_args_kwargs(
-                name, args_list, kwargs
-            )
-
-            if input_val is None:
-                continue
-
-            # The input_val may be a tensor or list/tuple of tensors. In certain cases, user may specify to shard
-            # the output instead of input for a particular layer by setting split_output=True
-            if isinstance(input_val, torch.Tensor):
-                input_val = self._prepare_cp_input(input_val, cpm)
-            elif isinstance(input_val, (list, tuple)):
-                if len(input_val) != len(cpm):
-                    raise ValueError(
-                        f"Expected input model plan to have {len(input_val)} elements, but got {len(cpm)}."
-                    )
-                sharded_input_val = []
-                for i, x in enumerate(input_val):
-                    if torch.is_tensor(x) and not cpm[i].split_output:
-                        x = self._prepare_cp_input(x, cpm[i])
-                    sharded_input_val.append(x)
-                input_val = sharded_input_val
-            else:
-                raise ValueError(f"Unsupported input type: {type(input_val)}")
-
-            if is_kwarg:
-                kwargs[name] = input_val
-            elif index is not None and index < len(args_list):
-                args_list[index] = input_val
-            else:
-                raise ValueError(
-                    f"An unexpected error occurred while processing the input '{name}'. Please open an "
-                    f"issue at https://github.com/huggingface/diffusers/issues and provide a minimal reproducible "
-                    f"example along with the full stack trace."
-                )
-
-        return tuple(args_list), kwargs
-
-    def post_forward(self, module, output):
-        is_tensor = isinstance(output, torch.Tensor)
-        is_tensor_list = isinstance(output, (list, tuple)) and all(isinstance(x, torch.Tensor) for x in output)
-
-        if not is_tensor and not is_tensor_list:
-            raise ValueError(f"Expected output to be a tensor or a list/tuple of tensors, but got {type(output)}.")
-
-        output = [output] if is_tensor else list(output)
-        for index, cpm in self.metadata.items():
-            if not isinstance(cpm, ContextParallelInput) or not cpm.split_output:
-                continue
-            if index >= len(output):
-                raise ValueError(f"Index {index} out of bounds for output of length {len(output)}.")
-            current_output = output[index]
-            current_output = self._prepare_cp_input(current_output, cpm)
-            output[index] = current_output
-
-        return output[0] if is_tensor else tuple(output)
-
-    def _prepare_cp_input(self, x: torch.Tensor, cp_input: ContextParallelInput) -> torch.Tensor:
-        if cp_input.expected_dims is not None and x.dim() != cp_input.expected_dims:
-            raise ValueError(
-                f"Expected input tensor to have {cp_input.expected_dims} dimensions, but got {x.dim()} dimensions."
-            )
-        return EquipartitionSharder.shard(x, cp_input.split_dim, self.parallel_config._flattened_mesh)
-
-
-class ContextParallelGatherHook(ModelHook):
-    def __init__(self, metadata: ContextParallelModelPlan, parallel_config: ContextParallelConfig) -> None:
-        super().__init__()
-        self.metadata = metadata
-        self.parallel_config = parallel_config
-
-    def post_forward(self, module, output):
-        is_tensor = isinstance(output, torch.Tensor)
-
-        if is_tensor:
-            output = [output]
-        elif not (isinstance(output, (list, tuple)) and all(isinstance(x, torch.Tensor) for x in output)):
-            raise ValueError(f"Expected output to be a tensor or a list/tuple of tensors, but got {type(output)}.")
-
-        output = list(output)
-
-        if len(output) != len(self.metadata):
-            raise ValueError(f"Expected output to have {len(self.metadata)} elements, but got {len(output)}.")
-
-        for i, cpm in enumerate(self.metadata):
-            if cpm is None:
-                continue
-            output[i] = EquipartitionSharder.unshard(output[i], cpm.gather_dim, self.parallel_config._flattened_mesh)
-
-        return output[0] if is_tensor else tuple(output)
-
-
-class AllGatherFunction(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, tensor, dim, group):
-        ctx.dim = dim
-        ctx.group = group
-        ctx.world_size = torch.distributed.get_world_size(group)
-        ctx.rank = torch.distributed.get_rank(group)
-        return funcol.all_gather_tensor(tensor, dim, group=group)
-
-    @staticmethod
-    def backward(ctx, grad_output):
-        grad_chunks = torch.chunk(grad_output, ctx.world_size, dim=ctx.dim)
-        return grad_chunks[ctx.rank], None, None
-
-
-class EquipartitionSharder:
-    @classmethod
-    def shard(cls, tensor: torch.Tensor, dim: int, mesh: torch.distributed.device_mesh.DeviceMesh) -> torch.Tensor:
-        # NOTE: the following assertion does not have to be true in general. We simply enforce it for now
-        # because the alternate case has not yet been tested/required for any model.
-        assert tensor.size()[dim] % mesh.size() == 0, (
-            "Tensor size along dimension to be sharded must be divisible by mesh size"
-        )
-
-        # The following is not fullgraph compatible with Dynamo (fails in DeviceMesh.get_rank)
-        # return tensor.chunk(mesh.size(), dim=dim)[mesh.get_rank()]
-
-        return tensor.chunk(mesh.size(), dim=dim)[torch.distributed.get_rank(mesh.get_group())]
-
-    @classmethod
-    def unshard(cls, tensor: torch.Tensor, dim: int, mesh: torch.distributed.device_mesh.DeviceMesh) -> torch.Tensor:
-        tensor = tensor.contiguous()
-        tensor = AllGatherFunction.apply(tensor, dim, mesh.get_group())
-        return tensor
-
-
-def _get_submodule_by_name(model: torch.nn.Module, name: str) -> Union[torch.nn.Module, List[torch.nn.Module]]:
-    if name.count("*") > 1:
-        raise ValueError("Wildcard '*' can only be used once in the name")
-    return _find_submodule_by_name(model, name)
-
-
-def _find_submodule_by_name(model: torch.nn.Module, name: str) -> Union[torch.nn.Module, List[torch.nn.Module]]:
-    if name == "":
-        return model
-    first_atom, remaining_name = name.split(".", 1) if "." in name else (name, "")
-    if first_atom == "*":
-        if not isinstance(model, torch.nn.ModuleList):
-            raise ValueError("Wildcard '*' can only be used with ModuleList")
-        submodules = []
-        for submodule in model:
-            subsubmodules = _find_submodule_by_name(submodule, remaining_name)
-            if not isinstance(subsubmodules, list):
-                subsubmodules = [subsubmodules]
-            submodules.extend(subsubmodules)
-        return submodules
-    else:
-        if hasattr(model, first_atom):
-            submodule = getattr(model, first_atom)
-            return _find_submodule_by_name(submodule, remaining_name)
-        else:
-            raise ValueError(f"'{first_atom}' is not a submodule of '{model.__class__.__name__}'")
@@ -54,11 +54,11 @@ class FasterCacheConfig:
    Attributes:
        spatial_attention_block_skip_range (`int`, defaults to `2`):
            Calculate the attention states every `N` iterations. If this is set to `N`, the attention computation will
-            be skipped `N - 1` times (i.e., cached attention states will be reused) before computing the new attention
+            be skipped `N - 1` times (i.e., cached attention states will be re-used) before computing the new attention
            states again.
        temporal_attention_block_skip_range (`int`, *optional*, defaults to `None`):
            Calculate the attention states every `N` iterations. If this is set to `N`, the attention computation will
-            be skipped `N - 1` times (i.e., cached attention states will be reused) before computing the new attention
+            be skipped `N - 1` times (i.e., cached attention states will be re-used) before computing the new attention
            states again.
        spatial_attention_timestep_skip_range (`Tuple[float, float]`, defaults to `(-1, 681)`):
            The timestep range within which the spatial attention computation can be skipped without a significant loss
@@ -90,7 +90,7 @@ class FasterCacheConfig:
            from the conditional branch outputs.
        unconditional_batch_skip_range (`int`, defaults to `5`):
            Process the unconditional branch every `N` iterations. If this is set to `N`, the unconditional branch
-            computation will be skipped `N - 1` times (i.e., cached unconditional branch states will be reused) before
+            computation will be skipped `N - 1` times (i.e., cached unconditional branch states will be re-used) before
            computing the new unconditional branch states again.
        unconditional_batch_timestep_skip_range (`Tuple[float, float]`, defaults to `(-1, 641)`):
            The timestep range within which the unconditional branch computation can be skipped without a significant
@@ -45,15 +45,15 @@ class PyramidAttentionBroadcastConfig:
        spatial_attention_block_skip_range (`int`, *optional*, defaults to `None`):
            The number of times a specific spatial attention broadcast is skipped before computing the attention states
            to re-use. If this is set to the value `N`, the attention computation will be skipped `N - 1` times (i.e.,
-            old attention states will be reused) before computing the new attention states again.
+            old attention states will be re-used) before computing the new attention states again.
        temporal_attention_block_skip_range (`int`, *optional*, defaults to `None`):
            The number of times a specific temporal attention broadcast is skipped before computing the attention
            states to re-use. If this is set to the value `N`, the attention computation will be skipped `N - 1` times
-            (i.e., old attention states will be reused) before computing the new attention states again.
+            (i.e., old attention states will be re-used) before computing the new attention states again.
        cross_attention_block_skip_range (`int`, *optional*, defaults to `None`):
            The number of times a specific cross-attention broadcast is skipped before computing the attention states
            to re-use. If this is set to the value `N`, the attention computation will be skipped `N - 1` times (i.e.,
-            old attention states will be reused) before computing the new attention states again.
+            old attention states will be re-used) before computing the new attention states again.
        spatial_attention_timestep_skip_range (`Tuple[int, int]`, defaults to `(100, 800)`):
            The range of timesteps to skip in the spatial attention layer. The attention computations will be
            conditionally skipped if the current timestep is within the specified range.
@@ -305,7 +305,7 @@ def _apply_pyramid_attention_broadcast_hook(
        block_skip_range (`int`):
            The number of times a specific attention broadcast is skipped before computing the attention states to
            re-use. If this is set to the value `N`, the attention computation will be skipped `N - 1` times (i.e., old
-            attention states will be reused) before computing the new attention states again.
+            attention states will be re-used) before computing the new attention states again.
        current_timestep_callback (`Callable[[], int]`):
            A callback function that returns the current inference timestep.
    """
@@ -523,7 +523,6 @@ class VaeImageProcessor(ConfigMixin):
                size=(height, width),
            )
            image = self.pt_to_numpy(image)
-
        return image

    def binarize(self, image: PIL.Image.Image) -> PIL.Image.Image:
@@ -839,137 +838,6 @@ class VaeImageProcessor(ConfigMixin):
        return image


-class InpaintProcessor(ConfigMixin):
-    """
-    Image processor for inpainting image and mask.
-    """
-
-    config_name = CONFIG_NAME
-
-    @register_to_config
-    def __init__(
-        self,
-        do_resize: bool = True,
-        vae_scale_factor: int = 8,
-        vae_latent_channels: int = 4,
-        resample: str = "lanczos",
-        reducing_gap: int = None,
-        do_normalize: bool = True,
-        do_binarize: bool = False,
-        do_convert_grayscale: bool = False,
-        mask_do_normalize: bool = False,
-        mask_do_binarize: bool = True,
-        mask_do_convert_grayscale: bool = True,
-    ):
-        super().__init__()
-
-        self._image_processor = VaeImageProcessor(
-            do_resize=do_resize,
-            vae_scale_factor=vae_scale_factor,
-            vae_latent_channels=vae_latent_channels,
-            resample=resample,
-            reducing_gap=reducing_gap,
-            do_normalize=do_normalize,
-            do_binarize=do_binarize,
-            do_convert_grayscale=do_convert_grayscale,
-        )
-        self._mask_processor = VaeImageProcessor(
-            do_resize=do_resize,
-            vae_scale_factor=vae_scale_factor,
-            vae_latent_channels=vae_latent_channels,
-            resample=resample,
-            reducing_gap=reducing_gap,
-            do_normalize=mask_do_normalize,
-            do_binarize=mask_do_binarize,
-            do_convert_grayscale=mask_do_convert_grayscale,
-        )
-
-    def preprocess(
-        self,
-        image: PIL.Image.Image,
-        mask: PIL.Image.Image = None,
-        height: int = None,
-        width: int = None,
-        padding_mask_crop: Optional[int] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
-        """
-        Preprocess the image and mask.
-        """
-        if mask is None and padding_mask_crop is not None:
-            raise ValueError("mask must be provided if padding_mask_crop is provided")
-
-        # if mask is None, same behavior as regular image processor
-        if mask is None:
-            return self._image_processor.preprocess(image, height=height, width=width)
-
-        if padding_mask_crop is not None:
-            crops_coords = self._image_processor.get_crop_region(mask, width, height, pad=padding_mask_crop)
-            resize_mode = "fill"
-        else:
-            crops_coords = None
-            resize_mode = "default"
-
-        processed_image = self._image_processor.preprocess(
-            image,
-            height=height,
-            width=width,
-            crops_coords=crops_coords,
-            resize_mode=resize_mode,
-        )
-
-        processed_mask = self._mask_processor.preprocess(
-            mask,
-            height=height,
-            width=width,
-            resize_mode=resize_mode,
-            crops_coords=crops_coords,
-        )
-
-        if crops_coords is not None:
-            postprocessing_kwargs = {
-                "crops_coords": crops_coords,
-                "original_image": image,
-                "original_mask": mask,
-            }
-        else:
-            postprocessing_kwargs = {
-                "crops_coords": None,
-                "original_image": None,
-                "original_mask": None,
-            }
-
-        return processed_image, processed_mask, postprocessing_kwargs
-
-    def postprocess(
-        self,
-        image: torch.Tensor,
-        output_type: str = "pil",
-        original_image: Optional[PIL.Image.Image] = None,
-        original_mask: Optional[PIL.Image.Image] = None,
-        crops_coords: Optional[Tuple[int, int, int, int]] = None,
-    ) -> Tuple[PIL.Image.Image, PIL.Image.Image]:
-        """
-        Postprocess the image, optionally apply mask overlay
-        """
-        image = self._image_processor.postprocess(
-            image,
-            output_type=output_type,
-        )
-        # optionally apply the mask overlay
-        if crops_coords is not None and (original_image is None or original_mask is None):
-            raise ValueError("original_image and original_mask must be provided if crops_coords is provided")
-
-        elif crops_coords is not None and output_type != "pil":
-            raise ValueError("output_type must be 'pil' if crops_coords is provided")
-
-        elif crops_coords is not None:
-            image = [
-                self._image_processor.apply_overlay(original_mask, original_image, i, crops_coords) for i in image
-            ]
-
-        return image
-
-
 class VaeImageProcessorLDM3D(VaeImageProcessor):
    """
    Image processor for VAE LDM3D.
@@ -1064,41 +1064,6 @@ class LoraBaseMixin:
        save_function(state_dict, save_path)
        logger.info(f"Model weights saved in {save_path}")

-    @classmethod
-    def _save_lora_weights(
-        cls,
-        save_directory: Union[str, os.PathLike],
-        lora_layers: Dict[str, Dict[str, Union[torch.nn.Module, torch.Tensor]]],
-        lora_metadata: Dict[str, Optional[dict]],
-        is_main_process: bool = True,
-        weight_name: str = None,
-        save_function: Callable = None,
-        safe_serialization: bool = True,
-    ):
-        """
-        Helper method to pack and save LoRA weights and metadata. This method centralizes the saving logic for all
-        pipeline types.
-        """
-        state_dict = {}
-        final_lora_adapter_metadata = {}
-
-        for prefix, layers in lora_layers.items():
-            state_dict.update(cls.pack_weights(layers, prefix))
-
-        for prefix, metadata in lora_metadata.items():
-            if metadata:
-                final_lora_adapter_metadata.update(_pack_dict_with_prefix(metadata, prefix))
-
-        cls.write_lora_layers(
-            state_dict=state_dict,
-            save_directory=save_directory,
-            is_main_process=is_main_process,
-            weight_name=weight_name,
-            save_function=save_function,
-            safe_serialization=safe_serialization,
-            lora_adapter_metadata=final_lora_adapter_metadata if final_lora_adapter_metadata else None,
-        )
-
    @classmethod
    def _optionally_disable_offloading(cls, _pipeline):
        return _func_optionally_disable_offloading(_pipeline=_pipeline)
@@ -558,62 +558,70 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
                    ait_sd[target_key] = value

        if any("guidance_in" in k for k in sds_sd):
-            _convert_to_ai_toolkit(
+            assign_remaining_weights(
+                [
+                    (
+                        "time_text_embed.guidance_embedder.linear_1.{lora_key}.weight",
+                        "lora_unet_guidance_in_in_layer.{orig_lora_key}.weight",
+                        None,
+                    ),
+                    (
+                        "time_text_embed.guidance_embedder.linear_2.{lora_key}.weight",
+                        "lora_unet_guidance_in_out_layer.{orig_lora_key}.weight",
+                        None,
+                    ),
+                ],
                sds_sd,
-                ait_sd,
-                "lora_unet_guidance_in_in_layer",
-                "time_text_embed.guidance_embedder.linear_1",
-            )
-
-            _convert_to_ai_toolkit(
-                sds_sd,
-                ait_sd,
-                "lora_unet_guidance_in_out_layer",
-                "time_text_embed.guidance_embedder.linear_2",
            )

        if any("img_in" in k for k in sds_sd):
-            _convert_to_ai_toolkit(
+            assign_remaining_weights(
+                [
+                    ("x_embedder.{lora_key}.weight", "lora_unet_img_in.{orig_lora_key}.weight", None),
+                ],
                sds_sd,
-                ait_sd,
-                "lora_unet_img_in",
-                "x_embedder",
            )

        if any("txt_in" in k for k in sds_sd):
-            _convert_to_ai_toolkit(
+            assign_remaining_weights(
+                [
+                    ("context_embedder.{lora_key}.weight", "lora_unet_txt_in.{orig_lora_key}.weight", None),
+                ],
                sds_sd,
-                ait_sd,
-                "lora_unet_txt_in",
-                "context_embedder",
            )

        if any("time_in" in k for k in sds_sd):
-            _convert_to_ai_toolkit(
+            assign_remaining_weights(
+                [
+                    (
+                        "time_text_embed.timestep_embedder.linear_1.{lora_key}.weight",
+                        "lora_unet_time_in_in_layer.{orig_lora_key}.weight",
+                        None,
+                    ),
+                    (
+                        "time_text_embed.timestep_embedder.linear_2.{lora_key}.weight",
+                        "lora_unet_time_in_out_layer.{orig_lora_key}.weight",
+                        None,
+                    ),
+                ],
                sds_sd,
-                ait_sd,
-                "lora_unet_time_in_in_layer",
-                "time_text_embed.timestep_embedder.linear_1",
-            )
-            _convert_to_ai_toolkit(
-                sds_sd,
-                ait_sd,
-                "lora_unet_time_in_out_layer",
-                "time_text_embed.timestep_embedder.linear_2",
            )

        if any("vector_in" in k for k in sds_sd):
-            _convert_to_ai_toolkit(
+            assign_remaining_weights(
+                [
+                    (
+                        "time_text_embed.text_embedder.linear_1.{lora_key}.weight",
+                        "lora_unet_vector_in_in_layer.{orig_lora_key}.weight",
+                        None,
+                    ),
+                    (
+                        "time_text_embed.text_embedder.linear_2.{lora_key}.weight",
+                        "lora_unet_vector_in_out_layer.{orig_lora_key}.weight",
+                        None,
+                    ),
+                ],
                sds_sd,
-                ait_sd,
-                "lora_unet_vector_in_in_layer",
-                "time_text_embed.text_embedder.linear_1",
-            )
-            _convert_to_ai_toolkit(
-                sds_sd,
-                ait_sd,
-                "lora_unet_vector_in_out_layer",
-                "time_text_embed.text_embedder.linear_2",
            )

        if any("final_layer" in k for k in sds_sd):
@@ -2121,10 +2129,6 @@ def _convert_non_diffusers_ltxv_lora_to_diffusers(state_dict, non_diffusers_pref


 def _convert_non_diffusers_qwen_lora_to_diffusers(state_dict):
-    has_diffusion_model = any(k.startswith("diffusion_model.") for k in state_dict)
-    if has_diffusion_model:
-        state_dict = {k.removeprefix("diffusion_model."): v for k, v in state_dict.items()}
-
    has_lora_unet = any(k.startswith("lora_unet_") for k in state_dict)
    if has_lora_unet:
        state_dict = {k.removeprefix("lora_unet_"): v for k, v in state_dict.items()}
@@ -2197,44 +2201,29 @@ def _convert_non_diffusers_qwen_lora_to_diffusers(state_dict):
    all_keys = list(state_dict.keys())
    down_key = ".lora_down.weight"
    up_key = ".lora_up.weight"
-    a_key = ".lora_A.weight"
-    b_key = ".lora_B.weight"

-    has_non_diffusers_lora_id = any(down_key in k or up_key in k for k in all_keys)
-    has_diffusers_lora_id = any(a_key in k or b_key in k for k in all_keys)
+    def get_alpha_scales(down_weight, alpha_key):
+        rank = down_weight.shape[0]
+        alpha = state_dict.pop(alpha_key).item()
+        scale = alpha / rank  # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
+        scale_down = scale
+        scale_up = 1.0
+        while scale_down * 2 < scale_up:
+            scale_down *= 2
+            scale_up /= 2
+        return scale_down, scale_up

-    if has_non_diffusers_lora_id:
+    for k in all_keys:
+        if k.endswith(down_key):
+            diffusers_down_key = k.replace(down_key, ".lora_A.weight")
+            diffusers_up_key = k.replace(down_key, up_key).replace(up_key, ".lora_B.weight")
+            alpha_key = k.replace(down_key, ".alpha")

-        def get_alpha_scales(down_weight, alpha_key):
-            rank = down_weight.shape[0]
-            alpha = state_dict.pop(alpha_key).item()
-            scale = alpha / rank  # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
-            scale_down = scale
-            scale_up = 1.0
-            while scale_down * 2 < scale_up:
-                scale_down *= 2
-                scale_up /= 2
-            return scale_down, scale_up
-
-        for k in all_keys:
-            if k.endswith(down_key):
-                diffusers_down_key = k.replace(down_key, ".lora_A.weight")
-                diffusers_up_key = k.replace(down_key, up_key).replace(up_key, ".lora_B.weight")
-                alpha_key = k.replace(down_key, ".alpha")
-
-                down_weight = state_dict.pop(k)
-                up_weight = state_dict.pop(k.replace(down_key, up_key))
-                scale_down, scale_up = get_alpha_scales(down_weight, alpha_key)
-                converted_state_dict[diffusers_down_key] = down_weight * scale_down
-                converted_state_dict[diffusers_up_key] = up_weight * scale_up
-
-    # Already in diffusers format (lora_A/lora_B), just pop
-    elif has_diffusers_lora_id:
-        for k in all_keys:
-            if a_key in k or b_key in k:
-                converted_state_dict[k] = state_dict.pop(k)
-            elif ".alpha" in k:
-                state_dict.pop(k)
+            down_weight = state_dict.pop(k)
+            up_weight = state_dict.pop(k.replace(down_key, up_key))
+            scale_down, scale_up = get_alpha_scales(down_weight, alpha_key)
+            converted_state_dict[diffusers_down_key] = down_weight * scale_down
+            converted_state_dict[diffusers_up_key] = up_weight * scale_up

    if len(state_dict) > 0:
        raise ValueError(f"`state_dict` should be empty at this point but has {state_dict.keys()=}")
@@ -22,7 +22,6 @@ from huggingface_hub.utils import validate_hf_hub_args
 from typing_extensions import Self

 from .. import __version__
-from ..models.model_loading_utils import _caching_allocator_warmup, _determine_device_map, _expand_device_map
 from ..quantizers import DiffusersAutoQuantizer
 from ..utils import deprecate, is_accelerate_available, is_torch_version, logging
 from ..utils.torch_utils import empty_device_cache
@@ -298,7 +297,6 @@ class FromOriginalModelMixin:
        low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
        device = kwargs.pop("device", None)
        disable_mmap = kwargs.pop("disable_mmap", False)
-        device_map = kwargs.pop("device_map", None)

        user_agent = {"diffusers": __version__, "file_type": "single_file", "framework": "pytorch"}
        # In order to ensure popular quantization methods are supported. Can be disable with `disable_telemetry`
@@ -405,8 +403,19 @@ class FromOriginalModelMixin:
        with ctx():
            model = cls.from_config(diffusers_model_config)

-        model_state_dict = model.state_dict()
+        checkpoint_mapping_kwargs = _get_mapping_function_kwargs(checkpoint_mapping_fn, **kwargs)

+        if _should_convert_state_dict_to_diffusers(model.state_dict(), checkpoint):
+            diffusers_format_checkpoint = checkpoint_mapping_fn(
+                config=diffusers_model_config, checkpoint=checkpoint, **checkpoint_mapping_kwargs
+            )
+        else:
+            diffusers_format_checkpoint = checkpoint
+
+        if not diffusers_format_checkpoint:
+            raise SingleFileComponentError(
+                f"Failed to load {mapping_class_name}. Weights for this component appear to be missing in the checkpoint."
+            )
        # Check if `_keep_in_fp32_modules` is not None
        use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and (
            (torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules")
@@ -419,26 +428,6 @@ class FromOriginalModelMixin:
        else:
            keep_in_fp32_modules = []

-        # Now that the model is loaded, we can determine the `device_map`
-        device_map = _determine_device_map(model, device_map, None, torch_dtype, keep_in_fp32_modules, hf_quantizer)
-        if device_map is not None:
-            expanded_device_map = _expand_device_map(device_map, model_state_dict.keys())
-            _caching_allocator_warmup(model, expanded_device_map, torch_dtype, hf_quantizer)
-
-        checkpoint_mapping_kwargs = _get_mapping_function_kwargs(checkpoint_mapping_fn, **kwargs)
-
-        if _should_convert_state_dict_to_diffusers(model_state_dict, checkpoint):
-            diffusers_format_checkpoint = checkpoint_mapping_fn(
-                config=diffusers_model_config, checkpoint=checkpoint, **checkpoint_mapping_kwargs
-            )
-        else:
-            diffusers_format_checkpoint = checkpoint
-
-        if not diffusers_format_checkpoint:
-            raise SingleFileComponentError(
-                f"Failed to load {mapping_class_name}. Weights for this component appear to be missing in the checkpoint."
-            )
-
        if hf_quantizer is not None:
            hf_quantizer.preprocess_model(
                model=model,
@@ -25,7 +25,6 @@ from ..utils import (
 _import_structure = {}

 if is_torch_available():
-    _import_structure["_modeling_parallel"] = ["ContextParallelConfig", "ParallelConfig"]
    _import_structure["adapter"] = ["MultiAdapter", "T2IAdapter"]
    _import_structure["attention_dispatch"] = ["AttentionBackendName", "attention_backend"]
    _import_structure["auto_model"] = ["AutoModel"]
@@ -120,7 +119,6 @@ if is_flax_available():

 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    if is_torch_available():
-        from ._modeling_parallel import ContextParallelConfig, ParallelConfig
        from .adapter import MultiAdapter, T2IAdapter
        from .attention_dispatch import AttentionBackendName, attention_backend
        from .auto_model import AutoModel
@@ -1,241 +0,0 @@
-# 🚨🚨🚨 Experimental parallelism support for Diffusers 🚨🚨🚨
-# Experimental changes are subject to change and APIs may break without warning.
-
-# Copyright 2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Tuple, Union
-
-import torch
-
-from ..utils import get_logger
-
-
-if TYPE_CHECKING:
-    pass
-
-
-logger = get_logger(__name__)  # pylint: disable=invalid-name
-
-
-# TODO(aryan): add support for the following:
-# - Unified Attention
-# - More dispatcher attention backends
-# - CFG/Data Parallel
-# - Tensor Parallel
-
-
-@dataclass
-class ContextParallelConfig:
-    """
-    Configuration for context parallelism.
-
-    Args:
-        ring_degree (`int`, *optional*, defaults to `1`):
-            Number of devices to use for ring attention within a context parallel region. Must be a divisor of the
-            total number of devices in the context parallel mesh.
-        ulysses_degree (`int`, *optional*, defaults to `1`):
-            Number of devices to use for ulysses attention within a context parallel region. Must be a divisor of the
-            total number of devices in the context parallel mesh.
-        convert_to_fp32 (`bool`, *optional*, defaults to `True`):
-            Whether to convert output and LSE to float32 for ring attention numerical stability.
-        rotate_method (`str`, *optional*, defaults to `"allgather"`):
-            Method to use for rotating key/value states across devices in ring attention. Currently, only `"allgather"`
-            is supported.
-
-    """
-
-    ring_degree: Optional[int] = None
-    ulysses_degree: Optional[int] = None
-    convert_to_fp32: bool = True
-    # TODO: support alltoall
-    rotate_method: Literal["allgather", "alltoall"] = "allgather"
-
-    _rank: int = None
-    _world_size: int = None
-    _device: torch.device = None
-    _mesh: torch.distributed.device_mesh.DeviceMesh = None
-    _flattened_mesh: torch.distributed.device_mesh.DeviceMesh = None
-    _ring_mesh: torch.distributed.device_mesh.DeviceMesh = None
-    _ulysses_mesh: torch.distributed.device_mesh.DeviceMesh = None
-    _ring_local_rank: int = None
-    _ulysses_local_rank: int = None
-
-    def __post_init__(self):
-        if self.ring_degree is None:
-            self.ring_degree = 1
-        if self.ulysses_degree is None:
-            self.ulysses_degree = 1
-
-    def setup(self, rank: int, world_size: int, device: torch.device, mesh: torch.distributed.device_mesh.DeviceMesh):
-        self._rank = rank
-        self._world_size = world_size
-        self._device = device
-        self._mesh = mesh
-        if self.ring_degree is None:
-            self.ring_degree = 1
-        if self.ulysses_degree is None:
-            self.ulysses_degree = 1
-        if self.rotate_method != "allgather":
-            raise NotImplementedError(
-                f"Only rotate_method='allgather' is supported for now, but got {self.rotate_method}."
-            )
-        if self._flattened_mesh is None:
-            self._flattened_mesh = self._mesh._flatten()
-        if self._ring_mesh is None:
-            self._ring_mesh = self._mesh["ring"]
-        if self._ulysses_mesh is None:
-            self._ulysses_mesh = self._mesh["ulysses"]
-        if self._ring_local_rank is None:
-            self._ring_local_rank = self._ring_mesh.get_local_rank()
-        if self._ulysses_local_rank is None:
-            self._ulysses_local_rank = self._ulysses_mesh.get_local_rank()
-
-
-@dataclass
-class ParallelConfig:
-    """
-    Configuration for applying different parallelisms.
-
-    Args:
-        context_parallel_config (`ContextParallelConfig`, *optional*):
-            Configuration for context parallelism.
-    """
-
-    context_parallel_config: Optional[ContextParallelConfig] = None
-
-    _rank: int = None
-    _world_size: int = None
-    _device: torch.device = None
-    _cp_mesh: torch.distributed.device_mesh.DeviceMesh = None
-
-    def setup(
-        self,
-        rank: int,
-        world_size: int,
-        device: torch.device,
-        *,
-        cp_mesh: Optional[torch.distributed.device_mesh.DeviceMesh] = None,
-    ):
-        self._rank = rank
-        self._world_size = world_size
-        self._device = device
-        self._cp_mesh = cp_mesh
-        if self.context_parallel_config is not None:
-            self.context_parallel_config.setup(rank, world_size, device, cp_mesh)
-
-
-@dataclass(frozen=True)
-class ContextParallelInput:
-    """
-    Configuration for splitting an input tensor across context parallel region.
-
-    Args:
-        split_dim (`int`):
-            The dimension along which to split the tensor.
-        expected_dims (`int`, *optional*):
-            The expected number of dimensions of the tensor. If provided, a check will be performed to ensure that the
-            tensor has the expected number of dimensions before splitting.
-        split_output (`bool`, *optional*, defaults to `False`):
-            Whether to split the output tensor of the layer along the given `split_dim` instead of the input tensor.
-            This is useful for layers whose outputs should be split after it does some preprocessing on the inputs (ex:
-            RoPE).
-    """
-
-    split_dim: int
-    expected_dims: Optional[int] = None
-    split_output: bool = False
-
-    def __repr__(self):
-        return f"ContextParallelInput(split_dim={self.split_dim}, expected_dims={self.expected_dims}, split_output={self.split_output})"
-
-
-@dataclass(frozen=True)
-class ContextParallelOutput:
-    """
-    Configuration for gathering an output tensor across context parallel region.
-
-    Args:
-        gather_dim (`int`):
-            The dimension along which to gather the tensor.
-        expected_dims (`int`, *optional*):
-            The expected number of dimensions of the tensor. If provided, a check will be performed to ensure that the
-            tensor has the expected number of dimensions before gathering.
-    """
-
-    gather_dim: int
-    expected_dims: Optional[int] = None
-
-    def __repr__(self):
-        return f"ContextParallelOutput(gather_dim={self.gather_dim}, expected_dims={self.expected_dims})"
-
-
-# A dictionary where keys denote the input to be split across context parallel region, and the
-# value denotes the sharding configuration.
-# If the key is a string, it denotes the name of the parameter in the forward function.
-# If the key is an integer, split_output must be set to True, and it denotes the index of the output
-# to be split across context parallel region.
-ContextParallelInputType = Dict[
-    Union[str, int], Union[ContextParallelInput, List[ContextParallelInput], Tuple[ContextParallelInput, ...]]
-]
-
-# A dictionary where keys denote the output to be gathered across context parallel region, and the
-# value denotes the gathering configuration.
-ContextParallelOutputType = Union[
-    ContextParallelOutput, List[ContextParallelOutput], Tuple[ContextParallelOutput, ...]
-]
-
-# A dictionary where keys denote the module id, and the value denotes how the inputs/outputs of
-# the module should be split/gathered across context parallel region.
-ContextParallelModelPlan = Dict[str, Union[ContextParallelInputType, ContextParallelOutputType]]
-
-
-# Example of a ContextParallelModelPlan (QwenImageTransformer2DModel):
-#
-# Each model should define a _cp_plan attribute that contains information on how to shard/gather
-# tensors at different stages of the forward:
-#
-# ```python
-# _cp_plan = {
-#     "": {
-#         "hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False),
-#         "encoder_hidden_states": ContextParallelInput(split_dim=1, expected_dims=3, split_output=False),
-#         "encoder_hidden_states_mask": ContextParallelInput(split_dim=1, expected_dims=2, split_output=False),
-#     },
-#     "pos_embed": {
-#         0: ContextParallelInput(split_dim=0, expected_dims=2, split_output=True),
-#         1: ContextParallelInput(split_dim=0, expected_dims=2, split_output=True),
-#     },
-#     "proj_out": ContextParallelOutput(gather_dim=1, expected_dims=3),
-# }
-# ```
-#
-# The dictionary is a set of module names mapped to their respective CP plan. The inputs/outputs of layers will be
-# split/gathered according to this at the respective module level. Here, the following happens:
-# - "":
-#     we specify that we want to split the various inputs across the sequence dim in the pre-forward hook (i.e. before
-#     the actual forward logic of the QwenImageTransformer2DModel is run, we will splitthe inputs)
-# - "pos_embed":
-#     we specify that we want to split the outputs of the RoPE layer. Since there are two outputs (imag & text freqs),
-#     we can individually specify how they should be split
-# - "proj_out":
-#     before returning to the user, we gather the entire sequence on each rank in the post-forward hook (after the linear
-#     layer forward has run).
-#
-# ContextParallelInput:
-#     specifies how to split the input tensor in the pre-forward or post-forward hook of the layer it is attached to
-#
-# ContextParallelOutput:
-#     specifies how to gather the input tensor in the post-forward hook in the layer it is attached to
@@ -241,7 +241,7 @@ class AttentionModuleMixin:
                            op_fw, op_bw = attention_op
                            dtype, *_ = op_fw.SUPPORTED_DTYPES
                        q = torch.randn((1, 2, 40), device="cuda", dtype=dtype)
-                        _ = xops.ops.memory_efficient_attention(q, q, q)
+                        _ = xops.memory_efficient_attention(q, q, q)
                except Exception as e:
                    raise e

@@ -674,7 +674,7 @@ class JointTransformerBlock(nn.Module):
        encoder_hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ):
        joint_attention_kwargs = joint_attention_kwargs or {}
        if self.use_dual_attention:
            norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp, norm_hidden_states2, gate_msa2 = self.norm1(
@@ -19,7 +19,6 @@ from huggingface_hub.utils import validate_hf_hub_args

 from ..configuration_utils import ConfigMixin
 from ..utils import logging
-from ..utils.dynamic_modules_utils import get_class_from_dynamic_module, resolve_trust_remote_code


 logger = logging.get_logger(__name__)
@@ -115,8 +114,6 @@ class AutoModel(ConfigMixin):
            disable_mmap ('bool', *optional*, defaults to 'False'):
                Whether to disable mmap when loading a Safetensors model. This option can perform better when the model
                is on a network mount or hard drive, which may not handle the seeky-ness of mmap very well.
-            trust_remote_cocde (`bool`, *optional*, defaults to `False`):
-                Whether to trust remote code

        <Tip>

@@ -143,22 +140,22 @@ class AutoModel(ConfigMixin):
        You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
        ```
        """
+        cache_dir = kwargs.pop("cache_dir", None)
+        force_download = kwargs.pop("force_download", False)
+        proxies = kwargs.pop("proxies", None)
+        token = kwargs.pop("token", None)
+        local_files_only = kwargs.pop("local_files_only", False)
+        revision = kwargs.pop("revision", None)
        subfolder = kwargs.pop("subfolder", None)
-        trust_remote_code = kwargs.pop("trust_remote_code", False)

-        hub_kwargs_names = [
-            "cache_dir",
-            "force_download",
-            "local_files_only",
-            "proxies",
-            "resume_download",
-            "revision",
-            "token",
-        ]
-        hub_kwargs = {name: kwargs.pop(name, None) for name in hub_kwargs_names}
-
-        # load_config_kwargs uses the same hub kwargs minus subfolder and resume_download
-        load_config_kwargs = {k: v for k, v in hub_kwargs.items() if k not in ["subfolder", "resume_download"]}
+        load_config_kwargs = {
+            "cache_dir": cache_dir,
+            "force_download": force_download,
+            "proxies": proxies,
+            "token": token,
+            "local_files_only": local_files_only,
+            "revision": revision,
+        }

        library = None
        orig_class_name = None
@@ -192,35 +189,15 @@ class AutoModel(ConfigMixin):
            else:
                raise ValueError(f"Couldn't find model associated with the config file at {pretrained_model_or_path}.")

-        has_remote_code = "auto_map" in config and cls.__name__ in config["auto_map"]
-        trust_remote_code = resolve_trust_remote_code(trust_remote_code, pretrained_model_or_path, has_remote_code)
-        if not has_remote_code and trust_remote_code:
-            raise ValueError(
-                "Selected model repository does not happear to have any custom code or does not have a valid `config.json` file."
-            )
+        from ..pipelines.pipeline_loading_utils import ALL_IMPORTABLE_CLASSES, get_class_obj_and_candidates

-        if has_remote_code and trust_remote_code:
-            class_ref = config["auto_map"][cls.__name__]
-            module_file, class_name = class_ref.split(".")
-            module_file = module_file + ".py"
-            model_cls = get_class_from_dynamic_module(
-                pretrained_model_or_path,
-                subfolder=subfolder,
-                module_file=module_file,
-                class_name=class_name,
-                **hub_kwargs,
-                **kwargs,
-            )
-        else:
-            from ..pipelines.pipeline_loading_utils import ALL_IMPORTABLE_CLASSES, get_class_obj_and_candidates
-
-            model_cls, _ = get_class_obj_and_candidates(
-                library_name=library,
-                class_name=orig_class_name,
-                importable_classes=ALL_IMPORTABLE_CLASSES,
-                pipelines=None,
-                is_pipeline_module=False,
-            )
+        model_cls, _ = get_class_obj_and_candidates(
+            library_name=library,
+            class_name=orig_class_name,
+            importable_classes=ALL_IMPORTABLE_CLASSES,
+            pipelines=None,
+            is_pipeline_module=False,
+        )

        if model_cls is None:
            raise ValueError(f"AutoModel can't find a model linked to {orig_class_name}.")
@@ -617,7 +617,7 @@ class AutoencoderDC(ModelMixin, ConfigMixin, FromOriginalModelMixin):
                returned.
        """
        if self.use_slicing and z.size(0) > 1:
-            decoded_slices = [self._decode(z_slice) for z_slice in z.split(1)]
+            decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
            decoded = torch.cat(decoded_slices)
        else:
            decoded = self._decode(z)
@@ -1052,7 +1052,7 @@ class AutoencoderKLWan(ModelMixin, ConfigMixin, FromOriginalModelMixin):
            is_residual=is_residual,
        )

-        self.spatial_compression_ratio = scale_factor_spatial
+        self.spatial_compression_ratio = 2 ** len(self.temperal_downsample)

        # When decoding a batch of video latents at a time, one can save memory by slicing across the batch dimension
        # to perform decoding of a single video latent at a time.
@@ -1145,13 +1145,12 @@ class AutoencoderKLWan(ModelMixin, ConfigMixin, FromOriginalModelMixin):
    def _encode(self, x: torch.Tensor):
        _, _, num_frame, height, width = x.shape

-        self.clear_cache()
-        if self.config.patch_size is not None:
-            x = patchify(x, patch_size=self.config.patch_size)
-
        if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
            return self.tiled_encode(x)

+        self.clear_cache()
+        if self.config.patch_size is not None:
+            x = patchify(x, patch_size=self.config.patch_size)
        iter_ = 1 + (num_frame - 1) // 4
        for i in range(iter_):
            self._enc_conv_idx = [0]
@@ -0,0 +1,115 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Tuple, Union
+
+from ..utils import deprecate
+from .controlnets.controlnet import (  # noqa
+    ControlNetConditioningEmbedding,
+    ControlNetModel,
+    ControlNetOutput,
+    zero_module,
+)
+
+
+class ControlNetOutput(ControlNetOutput):  # deprecated shim: subclasses the class imported from .controlnets.controlnet and rebinds its name in this module
+    def __init__(self, *args, **kwargs):  # accepts any arguments; the notice fires at construction time, not at import time
+        deprecation_message = "Importing `ControlNetOutput` from `diffusers.models.controlnet` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet import ControlNetOutput`, instead."
+        deprecate("diffusers.models.controlnet.ControlNetOutput", "0.34", deprecation_message)  # NOTE(review): "0.34" is the version tag given to deprecate(); confirm it still only warns at the current library version
+        super().__init__(*args, **kwargs)  # forward everything unchanged to the imported parent class
+
+
+class ControlNetModel(ControlNetModel):  # deprecated shim: subclasses the class imported from .controlnets.controlnet and rebinds its name in this module
+    def __init__(  # explicit signature; each parameter below is forwarded by keyword to the parent constructor
+        self,
+        in_channels: int = 4,
+        conditioning_channels: int = 3,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str, ...] = (
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "CrossAttnDownBlock2D",
+            "DownBlock2D",
+        ),
+        mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn",
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1280,
+        transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
+        encoder_hid_dim: Optional[int] = None,
+        encoder_hid_dim_type: Optional[str] = None,
+        attention_head_dim: Union[int, Tuple[int, ...]] = 8,
+        num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
+        use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        addition_embed_type: Optional[str] = None,
+        addition_time_embed_dim: Optional[int] = None,
+        num_class_embeds: Optional[int] = None,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        projection_class_embeddings_input_dim: Optional[int] = None,
+        controlnet_conditioning_channel_order: str = "rgb",
+        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
+        global_pool_conditions: bool = False,
+        addition_embed_type_num_heads: int = 64,
+    ):
+        deprecation_message = "Importing `ControlNetModel` from `diffusers.models.controlnet` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet import ControlNetModel`, instead."
+        deprecate("diffusers.models.controlnet.ControlNetModel", "0.34", deprecation_message)  # NOTE(review): runs on every construction; confirm deprecate() still only warns for the "0.34" tag
+        super().__init__(  # pure pass-through: no argument is altered before reaching the parent
+            in_channels=in_channels,
+            conditioning_channels=conditioning_channels,
+            flip_sin_to_cos=flip_sin_to_cos,
+            freq_shift=freq_shift,
+            down_block_types=down_block_types,
+            mid_block_type=mid_block_type,
+            only_cross_attention=only_cross_attention,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            downsample_padding=downsample_padding,
+            mid_block_scale_factor=mid_block_scale_factor,
+            act_fn=act_fn,
+            norm_num_groups=norm_num_groups,
+            norm_eps=norm_eps,
+            cross_attention_dim=cross_attention_dim,
+            transformer_layers_per_block=transformer_layers_per_block,
+            encoder_hid_dim=encoder_hid_dim,
+            encoder_hid_dim_type=encoder_hid_dim_type,
+            attention_head_dim=attention_head_dim,
+            num_attention_heads=num_attention_heads,
+            use_linear_projection=use_linear_projection,
+            class_embed_type=class_embed_type,
+            addition_embed_type=addition_embed_type,
+            addition_time_embed_dim=addition_time_embed_dim,
+            num_class_embeds=num_class_embeds,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            projection_class_embeddings_input_dim=projection_class_embeddings_input_dim,
+            controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
+            conditioning_embedding_out_channels=conditioning_embedding_out_channels,
+            global_pool_conditions=global_pool_conditions,
+            addition_embed_type_num_heads=addition_embed_type_num_heads,
+        )
+
+
+class ControlNetConditioningEmbedding(ControlNetConditioningEmbedding):  # deprecated shim over the class imported from .controlnets.controlnet; rebinds that name in this module
+    def __init__(self, *args, **kwargs):  # accepts any arguments; the notice fires at construction time, not at import time
+        deprecation_message = "Importing `ControlNetConditioningEmbedding` from `diffusers.models.controlnet` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet import ControlNetConditioningEmbedding`, instead."
+        deprecate("diffusers.models.controlnet.ControlNetConditioningEmbedding", "0.34", deprecation_message)  # NOTE(review): confirm deprecate() still only warns for the "0.34" tag at the current library version
+        super().__init__(*args, **kwargs)  # forward everything unchanged to the imported parent class
@@ -0,0 +1,70 @@
+# Copyright 2025 Black Forest Labs, The HuggingFace Team and The InstantX Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import List
+
+from ..utils import deprecate, logging
+from .controlnets.controlnet_flux import FluxControlNetModel, FluxControlNetOutput, FluxMultiControlNetModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class FluxControlNetOutput(FluxControlNetOutput):  # deprecated shim over the class imported from .controlnets.controlnet_flux; rebinds that name in this module
+    def __init__(self, *args, **kwargs):  # accepts any arguments; the notice fires at construction time, not at import time
+        deprecation_message = "Importing `FluxControlNetOutput` from `diffusers.models.controlnet_flux` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_flux import FluxControlNetOutput`, instead."
+        deprecate("diffusers.models.controlnet_flux.FluxControlNetOutput", "0.34", deprecation_message)  # NOTE(review): confirm deprecate() still only warns for the "0.34" tag at the current library version
+        super().__init__(*args, **kwargs)  # forward everything unchanged to the imported parent class
+
+
+class FluxControlNetModel(FluxControlNetModel):  # deprecated shim over the class imported from .controlnets.controlnet_flux; rebinds that name in this module
+    def __init__(  # explicit signature; each parameter below is forwarded by keyword to the parent constructor
+        self,
+        patch_size: int = 1,
+        in_channels: int = 64,
+        num_layers: int = 19,
+        num_single_layers: int = 38,
+        attention_head_dim: int = 128,
+        num_attention_heads: int = 24,
+        joint_attention_dim: int = 4096,
+        pooled_projection_dim: int = 768,
+        guidance_embeds: bool = False,
+        axes_dims_rope: List[int] = [16, 56, 56],  # mutable list default shared across calls; this block only forwards it and never mutates it
+        num_mode: int = None,  # NOTE(review): annotated `int` but defaults to None
+        conditioning_embedding_channels: int = None,  # NOTE(review): annotated `int` but defaults to None
+    ):
+        deprecation_message = "Importing `FluxControlNetModel` from `diffusers.models.controlnet_flux` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_flux import FluxControlNetModel`, instead."
+        deprecate("diffusers.models.controlnet_flux.FluxControlNetModel", "0.34", deprecation_message)  # NOTE(review): runs on every construction; confirm deprecate() still only warns for the "0.34" tag
+        super().__init__(  # pure pass-through: no argument is altered before reaching the parent
+            patch_size=patch_size,
+            in_channels=in_channels,
+            num_layers=num_layers,
+            num_single_layers=num_single_layers,
+            attention_head_dim=attention_head_dim,
+            num_attention_heads=num_attention_heads,
+            joint_attention_dim=joint_attention_dim,
+            pooled_projection_dim=pooled_projection_dim,
+            guidance_embeds=guidance_embeds,
+            axes_dims_rope=axes_dims_rope,
+            num_mode=num_mode,
+            conditioning_embedding_channels=conditioning_embedding_channels,
+        )
+
+
+class FluxMultiControlNetModel(FluxMultiControlNetModel):  # deprecated shim over the class imported from .controlnets.controlnet_flux; rebinds that name in this module
+    def __init__(self, *args, **kwargs):  # accepts any arguments; the notice fires at construction time, not at import time
+        deprecation_message = "Importing `FluxMultiControlNetModel` from `diffusers.models.controlnet_flux` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_flux import FluxMultiControlNetModel`, instead."
+        deprecate("diffusers.models.controlnet_flux.FluxMultiControlNetModel", "0.34", deprecation_message)  # NOTE(review): confirm deprecate() still only warns for the "0.34" tag at the current library version
+        super().__init__(*args, **kwargs)  # forward everything unchanged to the imported parent class
@@ -0,0 +1,68 @@
+# Copyright 2025 Stability AI, The HuggingFace Team and The InstantX Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from ..utils import deprecate, logging
+from .controlnets.controlnet_sd3 import SD3ControlNetModel, SD3ControlNetOutput, SD3MultiControlNetModel
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class SD3ControlNetOutput(SD3ControlNetOutput):  # deprecated shim over the class imported from .controlnets.controlnet_sd3; rebinds that name in this module
+    def __init__(self, *args, **kwargs):  # accepts any arguments; the notice fires at construction time, not at import time
+        deprecation_message = "Importing `SD3ControlNetOutput` from `diffusers.models.controlnet_sd3` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_sd3 import SD3ControlNetOutput`, instead."
+        deprecate("diffusers.models.controlnet_sd3.SD3ControlNetOutput", "0.34", deprecation_message)  # NOTE(review): confirm deprecate() still only warns for the "0.34" tag at the current library version
+        super().__init__(*args, **kwargs)  # forward everything unchanged to the imported parent class
+
+
+class SD3ControlNetModel(SD3ControlNetModel):  # deprecated shim over the class imported from .controlnets.controlnet_sd3; rebinds that name in this module
+    def __init__(  # explicit signature; each parameter below is forwarded by keyword to the parent constructor
+        self,
+        sample_size: int = 128,
+        patch_size: int = 2,
+        in_channels: int = 16,
+        num_layers: int = 18,
+        attention_head_dim: int = 64,
+        num_attention_heads: int = 18,
+        joint_attention_dim: int = 4096,
+        caption_projection_dim: int = 1152,
+        pooled_projection_dim: int = 2048,
+        out_channels: int = 16,
+        pos_embed_max_size: int = 96,
+        extra_conditioning_channels: int = 0,
+    ):
+        deprecation_message = "Importing `SD3ControlNetModel` from `diffusers.models.controlnet_sd3` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_sd3 import SD3ControlNetModel`, instead."
+        deprecate("diffusers.models.controlnet_sd3.SD3ControlNetModel", "0.34", deprecation_message)  # NOTE(review): runs on every construction; confirm deprecate() still only warns for the "0.34" tag
+        super().__init__(  # pure pass-through: no argument is altered before reaching the parent
+            sample_size=sample_size,
+            patch_size=patch_size,
+            in_channels=in_channels,
+            num_layers=num_layers,
+            attention_head_dim=attention_head_dim,
+            num_attention_heads=num_attention_heads,
+            joint_attention_dim=joint_attention_dim,
+            caption_projection_dim=caption_projection_dim,
+            pooled_projection_dim=pooled_projection_dim,
+            out_channels=out_channels,
+            pos_embed_max_size=pos_embed_max_size,
+            extra_conditioning_channels=extra_conditioning_channels,
+        )
+
+
+class SD3MultiControlNetModel(SD3MultiControlNetModel):  # deprecated shim over the class imported from .controlnets.controlnet_sd3; rebinds that name in this module
+    def __init__(self, *args, **kwargs):  # accepts any arguments; the notice fires at construction time, not at import time
+        deprecation_message = "Importing `SD3MultiControlNetModel` from `diffusers.models.controlnet_sd3` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_sd3 import SD3MultiControlNetModel`, instead."
+        deprecate("diffusers.models.controlnet_sd3.SD3MultiControlNetModel", "0.34", deprecation_message)  # NOTE(review): confirm deprecate() still only warns for the "0.34" tag at the current library version
+        super().__init__(*args, **kwargs)  # forward everything unchanged to the imported parent class
@@ -0,0 +1,116 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import Optional, Tuple, Union
+
+from ..utils import deprecate, logging
+from .controlnets.controlnet_sparsectrl import (  # noqa
+    SparseControlNetConditioningEmbedding,
+    SparseControlNetModel,
+    SparseControlNetOutput,
+    zero_module,
+)
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+
+class SparseControlNetOutput(SparseControlNetOutput):  # deprecated shim over the class imported from .controlnets.controlnet_sparsectrl; rebinds that name in this module
+    def __init__(self, *args, **kwargs):  # accepts any arguments; the notice fires at construction time, not at import time
+        deprecation_message = "Importing `SparseControlNetOutput` from `diffusers.models.controlnet_sparsectrl` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_sparsectrl import SparseControlNetOutput`, instead."
+        deprecate("diffusers.models.controlnet_sparsectrl.SparseControlNetOutput", "0.34", deprecation_message)  # NOTE(review): confirm deprecate() still only warns for the "0.34" tag at the current library version
+        super().__init__(*args, **kwargs)  # forward everything unchanged to the imported parent class
+
+
+class SparseControlNetConditioningEmbedding(SparseControlNetConditioningEmbedding):  # deprecated shim over the class imported from .controlnets.controlnet_sparsectrl; rebinds that name in this module
+    def __init__(self, *args, **kwargs):  # accepts any arguments; the notice fires at construction time, not at import time
+        deprecation_message = "Importing `SparseControlNetConditioningEmbedding` from `diffusers.models.controlnet_sparsectrl` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_sparsectrl import SparseControlNetConditioningEmbedding`, instead."
+        deprecate(  # NOTE(review): confirm deprecate() still only warns for the "0.34" tag at the current library version
+            "diffusers.models.controlnet_sparsectrl.SparseControlNetConditioningEmbedding", "0.34", deprecation_message
+        )
+        super().__init__(*args, **kwargs)  # forward everything unchanged to the imported parent class
+
+
+class SparseControlNetModel(SparseControlNetModel):  # deprecated shim over the class imported from .controlnets.controlnet_sparsectrl; rebinds that name in this module
+    def __init__(  # explicit signature; each parameter below is forwarded by keyword to the parent constructor
+        self,
+        in_channels: int = 4,
+        conditioning_channels: int = 4,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str, ...] = (
+            "CrossAttnDownBlockMotion",
+            "CrossAttnDownBlockMotion",
+            "CrossAttnDownBlockMotion",
+            "DownBlockMotion",
+        ),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: Optional[int] = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 768,
+        transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
+        transformer_layers_per_mid_block: Optional[Union[int, Tuple[int]]] = None,
+        temporal_transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1,
+        attention_head_dim: Union[int, Tuple[int, ...]] = 8,
+        num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None,
+        use_linear_projection: bool = False,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+        conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256),
+        global_pool_conditions: bool = False,
+        controlnet_conditioning_channel_order: str = "rgb",
+        motion_max_seq_length: int = 32,
+        motion_num_attention_heads: int = 8,
+        concat_conditioning_mask: bool = True,
+        use_simplified_condition_embedding: bool = True,
+    ):
+        deprecation_message = "Importing `SparseControlNetModel` from `diffusers.models.controlnet_sparsectrl` is deprecated and this will be removed in a future version. Please use `from diffusers.models.controlnets.controlnet_sparsectrl import SparseControlNetModel`, instead."
+        deprecate("diffusers.models.controlnet_sparsectrl.SparseControlNetModel", "0.34", deprecation_message)  # NOTE(review): runs on every construction; confirm deprecate() still only warns for the "0.34" tag
+        super().__init__(  # pure pass-through: no argument is altered before reaching the parent
+            in_channels=in_channels,
+            conditioning_channels=conditioning_channels,
+            flip_sin_to_cos=flip_sin_to_cos,
+            freq_shift=freq_shift,
+            down_block_types=down_block_types,
+            only_cross_attention=only_cross_attention,
+            block_out_channels=block_out_channels,
+            layers_per_block=layers_per_block,
+            downsample_padding=downsample_padding,
+            mid_block_scale_factor=mid_block_scale_factor,
+            act_fn=act_fn,
+            norm_num_groups=norm_num_groups,
+            norm_eps=norm_eps,
+            cross_attention_dim=cross_attention_dim,
+            transformer_layers_per_block=transformer_layers_per_block,
+            transformer_layers_per_mid_block=transformer_layers_per_mid_block,
+            temporal_transformer_layers_per_block=temporal_transformer_layers_per_block,
+            attention_head_dim=attention_head_dim,
+            num_attention_heads=num_attention_heads,
+            use_linear_projection=use_linear_projection,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+            conditioning_embedding_out_channels=conditioning_embedding_out_channels,
+            global_pool_conditions=global_pool_conditions,
+            controlnet_conditioning_channel_order=controlnet_conditioning_channel_order,
+            motion_max_seq_length=motion_max_seq_length,
+            motion_num_attention_heads=motion_num_attention_heads,
+            concat_conditioning_mask=concat_conditioning_mask,
+            use_simplified_condition_embedding=use_simplified_condition_embedding,
+        )
@@ -26,11 +26,11 @@ from flax.traverse_util import flatten_dict, unflatten_dict
 from huggingface_hub import create_repo, hf_hub_download
 from huggingface_hub.utils import (
    EntryNotFoundError,
-    HfHubHTTPError,
    RepositoryNotFoundError,
    RevisionNotFoundError,
    validate_hf_hub_args,
 )
+from requests import HTTPError

 from .. import __version__, is_torch_available
 from ..utils import (
@@ -385,7 +385,7 @@ class FlaxModelMixin(PushToHubMixin):
                raise EnvironmentError(
                    f"{pretrained_model_name_or_path} does not appear to have a file named {FLAX_WEIGHTS_NAME}."
                )
-            except HfHubHTTPError as err:
+            except HTTPError as err:
                raise EnvironmentError(
                    f"There was a specific connection error when trying to load {pretrained_model_name_or_path}:\n"
                    f"{err}"
@@ -65,7 +65,6 @@ from ..utils.hub_utils import (
    populate_model_card,
 )
 from ..utils.torch_utils import empty_device_cache
-from ._modeling_parallel import ContextParallelConfig, ContextParallelModelPlan, ParallelConfig
 from .model_loading_utils import (
    _caching_allocator_warmup,
    _determine_device_map,
@@ -249,8 +248,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
    _skip_layerwise_casting_patterns = None
    _supports_group_offloading = True
    _repeated_blocks = []
-    _parallel_config = None
-    _cp_plan = None

    def __init__(self):
        super().__init__()
@@ -623,8 +620,8 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):

    def reset_attention_backend(self) -> None:
        """
-        Resets the attention backend for the model. Following calls to `forward` will use the environment default, if
-        set, or the torch native scaled dot product attention.
+        Resets the attention backend for the model. Following calls to `forward` will use the environment default or
+        the torch native scaled dot product attention.
        """
        from .attention import AttentionModuleMixin
        from .attention_processor import Attention, MochiAttention
@@ -963,7 +960,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
        quantization_config = kwargs.pop("quantization_config", None)
        dduf_entries: Optional[Dict[str, DDUFEntry]] = kwargs.pop("dduf_entries", None)
        disable_mmap = kwargs.pop("disable_mmap", False)
-        parallel_config: Optional[Union[ParallelConfig, ContextParallelConfig]] = kwargs.pop("parallel_config", None)

        is_parallel_loading_enabled = HF_ENABLE_PARALLEL_LOADING
        if is_parallel_loading_enabled and not low_cpu_mem_usage:
@@ -1344,9 +1340,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
        # Set model in evaluation mode to deactivate DropOut modules by default
        model.eval()

-        if parallel_config is not None:
-            model.enable_parallelism(config=parallel_config)
-
        if output_loading_info:
            return model, loading_info

@@ -1485,73 +1478,6 @@ class ModelMixin(torch.nn.Module, PushToHubMixin):
                f"Regional compilation failed because {repeated_blocks} classes are not found in the model. "
            )

-    def enable_parallelism(
-        self,
-        *,
-        config: Union[ParallelConfig, ContextParallelConfig],
-        cp_plan: Optional[Dict[str, ContextParallelModelPlan]] = None,
-    ):
-        from ..hooks.context_parallel import apply_context_parallel
-        from .attention import AttentionModuleMixin
-        from .attention_processor import Attention, MochiAttention
-
-        logger.warning(
-            "`enable_parallelism` is an experimental feature. The API may change in the future and breaking changes may be introduced at any time without warning."
-        )
-
-        if isinstance(config, ContextParallelConfig):
-            config = ParallelConfig(context_parallel_config=config)
-
-        if not torch.distributed.is_initialized():
-            raise RuntimeError("torch.distributed must be initialized before calling `enable_parallelism`.")
-
-        rank = torch.distributed.get_rank()
-        world_size = torch.distributed.get_world_size()
-        device_type = torch._C._get_accelerator().type
-        device_module = torch.get_device_module(device_type)
-        device = torch.device(device_type, rank % device_module.device_count())
-
-        cp_mesh = None
-        if config.context_parallel_config is not None:
-            cp_config = config.context_parallel_config
-            if cp_config.ring_degree < 1 or cp_config.ulysses_degree < 1:
-                raise ValueError("`ring_degree` and `ulysses_degree` must be greater than or equal to 1.")
-            if cp_config.ring_degree > 1 and cp_config.ulysses_degree > 1:
-                raise ValueError(
-                    "Unified Ulysses-Ring attention is not yet supported. Please set either `ring_degree` or `ulysses_degree` to 1."
-                )
-            if cp_config.ring_degree * cp_config.ulysses_degree > world_size:
-                raise ValueError(
-                    f"The product of `ring_degree` ({cp_config.ring_degree}) and `ulysses_degree` ({cp_config.ulysses_degree}) must not exceed the world size ({world_size})."
-                )
-            cp_mesh = torch.distributed.device_mesh.init_device_mesh(
-                device_type=device_type,
-                mesh_shape=(cp_config.ring_degree, cp_config.ulysses_degree),
-                mesh_dim_names=("ring", "ulysses"),
-            )
-
-        config.setup(rank, world_size, device, cp_mesh=cp_mesh)
-
-        if cp_plan is None and self._cp_plan is None:
-            raise ValueError(
-                "`cp_plan` must be provided either as an argument or set in the model's `_cp_plan` attribute."
-            )
-        cp_plan = cp_plan if cp_plan is not None else self._cp_plan
-
-        if config.context_parallel_config is not None:
-            apply_context_parallel(self, config.context_parallel_config, cp_plan)
-
-        self._parallel_config = config
-
-        attention_classes = (Attention, MochiAttention, AttentionModuleMixin)
-        for module in self.modules():
-            if not isinstance(module, attention_classes):
-                continue
-            processor = module.processor
-            if processor is None or not hasattr(processor, "_parallel_config"):
-                continue
-            processor._parallel_config = config
-
    @classmethod
    def _load_pretrained_model(
        cls,
@@ -13,7 +13,7 @@
 # limitations under the License.


-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional, Union

 import torch
 import torch.nn as nn
@@ -92,7 +92,7 @@ class AuraFlowPatchEmbed(nn.Module):

        return selected_indices

-    def forward(self, latent) -> torch.Tensor:
+    def forward(self, latent):
        batch_size, num_channels, height, width = latent.size()
        latent = latent.view(
            batch_size,
@@ -173,7 +173,7 @@ class AuraFlowSingleTransformerBlock(nn.Module):
        hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        attention_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> torch.Tensor:
+    ):
        residual = hidden_states
        attention_kwargs = attention_kwargs or {}

@@ -242,7 +242,7 @@ class AuraFlowJointTransformerBlock(nn.Module):
        encoder_hidden_states: torch.FloatTensor,
        temb: torch.FloatTensor,
        attention_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ):
        residual = hidden_states
        residual_context = encoder_hidden_states
        attention_kwargs = attention_kwargs or {}
@@ -472,7 +472,7 @@ class AuraFlowTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, From
        timestep: torch.LongTensor = None,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
-    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
+    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
@@ -122,7 +122,7 @@ class CogVideoXBlock(nn.Module):
        temb: torch.Tensor,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        attention_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> torch.Tensor:
        text_seq_length = encoder_hidden_states.size(1)
        attention_kwargs = attention_kwargs or {}

@@ -441,7 +441,7 @@ class CogVideoXTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Cac
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
-    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
+    ):
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
@@ -315,7 +315,7 @@ class ConsisIDBlock(nn.Module):
        encoder_hidden_states: torch.Tensor,
        temb: torch.Tensor,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> torch.Tensor:
        text_seq_length = encoder_hidden_states.size(1)

        # norm & modulate
@@ -691,7 +691,7 @@ class ConsisIDTransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
        id_cond: Optional[torch.Tensor] = None,
        id_vit_hidden: Optional[torch.Tensor] = None,
        return_dict: bool = True,
-    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
+    ):
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Any, Dict, Optional, Tuple, Union
+from typing import Any, Dict, Optional

 import torch
 import torch.nn as nn
@@ -124,7 +124,7 @@ class LuminaNextDiTBlock(nn.Module):
        encoder_mask: torch.Tensor,
        temb: torch.Tensor,
        cross_attention_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> torch.Tensor:
+    ):
        """
        Perform a forward pass through the LuminaNextDiTBlock.

@@ -297,7 +297,7 @@ class LuminaNextDiT2DModel(ModelMixin, ConfigMixin):
        image_rotary_emb: torch.Tensor,
        cross_attention_kwargs: Dict[str, Any] = None,
        return_dict=True,
-    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
+    ) -> torch.Tensor:
        """
        Forward pass of LuminaNextDiT.

@@ -120,7 +120,6 @@ def get_1d_rotary_pos_embed(

 class BriaAttnProcessor:
    _attention_backend = None
-    _parallel_config = None

    def __init__(self):
        if not hasattr(F, "scaled_dot_product_attention"):
@@ -162,12 +161,7 @@ class BriaAttnProcessor:
            key = apply_rotary_emb(key, image_rotary_emb, sequence_dim=1)

        hidden_states = dispatch_attention_fn(
-            query,
-            key,
-            value,
-            attn_mask=attention_mask,
-            backend=self._attention_backend,
-            parallel_config=self._parallel_config,
+            query, key, value, attn_mask=attention_mask, backend=self._attention_backend
        )
        hidden_states = hidden_states.flatten(2, 3)
        hidden_states = hidden_states.to(query.dtype)
@@ -478,7 +472,7 @@ class BriaSingleTransformerBlock(nn.Module):
        temb: torch.Tensor,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        attention_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> torch.Tensor:
        text_seq_len = encoder_hidden_states.shape[1]
        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

@@ -594,7 +588,7 @@ class BriaTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOrig
        return_dict: bool = True,
        controlnet_block_samples=None,
        controlnet_single_block_samples=None,
-    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
+    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
        """
        The [`BriaTransformer2DModel`] forward method.

@@ -13,7 +13,7 @@
 # limitations under the License.


-from typing import Dict, Tuple, Union
+from typing import Dict, Union

 import torch
 import torch.nn as nn
@@ -79,7 +79,7 @@ class CogView3PlusTransformerBlock(nn.Module):
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        emb: torch.Tensor,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> torch.Tensor:
        text_seq_length = encoder_hidden_states.size(1)

        # norm & modulate
@@ -293,7 +293,7 @@ class CogView3PlusTransformer2DModel(ModelMixin, ConfigMixin):
        target_size: torch.Tensor,
        crop_coords: torch.Tensor,
        return_dict: bool = True,
-    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
+    ) -> Union[torch.Tensor, Transformer2DModelOutput]:
        """
        The [`CogView3PlusTransformer2DModel`] forward method.

@@ -494,7 +494,7 @@ class CogView4TransformerBlock(nn.Module):
        ] = None,
        attention_mask: Optional[Dict[str, torch.Tensor]] = None,
        attention_kwargs: Optional[Dict[str, Any]] = None,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ) -> torch.Tensor:
        # 1. Timestep conditioning
        (
            norm_hidden_states,
@@ -717,7 +717,7 @@ class CogView4Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Cach
        image_rotary_emb: Optional[
            Union[Tuple[torch.Tensor, torch.Tensor], List[Tuple[torch.Tensor, torch.Tensor]]]
        ] = None,
-    ) -> Union[Tuple[torch.Tensor], Transformer2DModelOutput]:
+    ) -> Union[torch.Tensor, Transformer2DModelOutput]:
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
DN6	215af1a602	resolve conflicts	2025-08-28 15:12:03 +05:30
DN6	1a917d3ac5	Revert "merge main" This reverts commit `65efbcead5`.	2025-08-28 15:02:28 +05:30
DN6	65efbcead5	merge main	2025-08-28 14:56:46 +05:30
DN6	2a52a25b9a	update	2025-08-26 10:50:02 +05:30
DN6	0137a16ed5	update	2025-08-26 10:37:30 +05:30
DN6	ce12925a23	update	2025-08-26 09:39:50 +05:30
DN6	80b06b0d5f	update	2025-08-26 08:36:19 +05:30
DN6	42c19fdd0d	update	2025-08-26 08:35:26 +05:30