update

2025-01-14 12:07:45 +05:30
367 changed files with 2481 additions and 13222 deletions
@@ -265,7 +265,7 @@ jobs:

      - name: Run PyTorch CUDA tests
        env:
-          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
          CUBLAS_WORKSPACE_CONFIG: :16:8
        run: |
@@ -505,7 +505,7 @@ jobs:
 #        shell: arch -arch arm64 bash {0}
 #        env:
 #          HF_HOME: /System/Volumes/Data/mnt/cache
-#          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+#          HF_TOKEN: ${{ secrets.HF_TOKEN }}
 #        run: |
 #          ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
 #            --report-log=tests_torch_mps.log \
@@ -561,7 +561,7 @@ jobs:
 #        shell: arch -arch arm64 bash {0}
 #        env:
 #          HF_HOME: /System/Volumes/Data/mnt/cache
-#          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+#          HF_TOKEN: ${{ secrets.HF_TOKEN }}
 #        run: |
 #          ${CONDA_RUN} python -m pytest -n 1 -s -v --make-reports=tests_torch_mps \
 #            --report-log=tests_torch_mps.log \
@@ -137,7 +137,7 @@ jobs:

    - name: Run PyTorch CUDA tests
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
@@ -187,7 +187,7 @@ jobs:

    - name: Run Flax TPU tests
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m pytest -n 0 \
          -s -v -k "Flax" \
@@ -235,7 +235,7 @@ jobs:

    - name: Run ONNXRuntime CUDA tests
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Onnx" \
@@ -283,7 +283,7 @@ jobs:
        python utils/print_env.py
    - name: Run example tests on GPU
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        RUN_COMPILE: yes
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
@@ -326,7 +326,7 @@ jobs:
        python utils/print_env.py
    - name: Run example tests on GPU
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
    - name: Failure short reports
@@ -372,7 +372,7 @@ jobs:

    - name: Run example tests on GPU
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install timm
@@ -81,7 +81,7 @@ jobs:
          python utils/print_env.py
      - name: Slow PyTorch CUDA checkpoint tests on Ubuntu
        env:
-          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
          CUBLAS_WORKSPACE_CONFIG: :16:8
        run: |
@@ -135,7 +135,7 @@ jobs:

    - name: Run PyTorch CUDA tests
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
      run: |
@@ -186,7 +186,7 @@ jobs:

      - name: Run PyTorch CUDA tests
        env:
-          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
          CUBLAS_WORKSPACE_CONFIG: :16:8
        run: |
@@ -241,7 +241,7 @@ jobs:

    - name: Run slow Flax TPU tests
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m pytest -n 0 \
          -s -v -k "Flax" \
@@ -289,7 +289,7 @@ jobs:

    - name: Run slow ONNXRuntime CUDA tests
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "Onnx" \
@@ -337,7 +337,7 @@ jobs:
        python utils/print_env.py
    - name: Run example tests on GPU
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
        RUN_COMPILE: yes
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
@@ -380,7 +380,7 @@ jobs:
        python utils/print_env.py
    - name: Run example tests on GPU
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
    - name: Failure short reports
@@ -426,7 +426,7 @@ jobs:

    - name: Run example tests on GPU
      env:
-        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
+        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install timm
@@ -79,8 +79,6 @@
 - sections:
  - local: using-diffusers/cogvideox
    title: CogVideoX
-  - local: using-diffusers/consisid
-    title: ConsisID
  - local: using-diffusers/sdxl
    title: Stable Diffusion XL
  - local: using-diffusers/sdxl_turbo
@@ -181,8 +179,6 @@
    title: TGATE
  - local: optimization/xdit
    title: xDiT
-  - local: optimization/para_attn
-    title: ParaAttention
  - sections:
    - local: using-diffusers/stable_diffusion_jax_how_to
      title: JAX/Flax
@@ -272,8 +268,6 @@
        title: AuraFlowTransformer2DModel
      - local: api/models/cogvideox_transformer3d
        title: CogVideoXTransformer3DModel
-      - local: api/models/consisid_transformer3d
-        title: ConsisIDTransformer3DModel
      - local: api/models/cogview3plus_transformer2d
        title: CogView3PlusTransformer2DModel
      - local: api/models/dit_transformer2d
@@ -376,8 +370,6 @@
      title: CogVideoX
    - local: api/pipelines/cogview3
      title: CogView3
-    - local: api/pipelines/consisid
-      title: ConsisID
    - local: api/pipelines/consistency_models
      title: Consistency Models
    - local: api/pipelines/controlnet
@@ -598,8 +590,6 @@
      title: Attention Processor
    - local: api/activations
      title: Custom activation functions
-    - local: api/cache
-      title: Caching methods
    - local: api/normalization
      title: Custom normalization layers
    - local: api/utilities
@@ -1,49 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# Caching methods
-
-## Pyramid Attention Broadcast
-
-[Pyramid Attention Broadcast](https://huggingface.co/papers/2408.12588) from Xuanlei Zhao, Xiaolong Jin, Kai Wang, Yang You.
-
-Pyramid Attention Broadcast (PAB) is a method that speeds up inference in diffusion models by systematically skipping attention computations between successive inference steps and reusing cached attention states. The attention states are not very different between successive inference steps. The most prominent difference is in the spatial attention blocks, not as much in the temporal attention blocks, and finally the least in the cross attention blocks. Therefore, many cross attention computation blocks can be skipped, followed by the temporal and spatial attention blocks. By combining other techniques like sequence parallelism and classifier-free guidance parallelism, PAB achieves near real-time video generation.
-
-Enable PAB with [`~PyramidAttentionBroadcastConfig`] on any pipeline. For some benchmarks, refer to [this](https://github.com/huggingface/diffusers/pull/9562) pull request.
-
-```python
-import torch
-from diffusers import CogVideoXPipeline, PyramidAttentionBroadcastConfig
-
-pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-
-# Increasing the value of `spatial_attention_timestep_skip_range[0]` or decreasing the value of
-# `spatial_attention_timestep_skip_range[1]` will decrease the interval in which pyramid attention
-# broadcast is active, leader to slower inference speeds. However, large intervals can lead to
-# poorer quality of generated videos.
-config = PyramidAttentionBroadcastConfig(
-    spatial_attention_block_skip_range=2,
-    spatial_attention_timestep_skip_range=(100, 800),
-    current_timestep_callback=lambda: pipe.current_timestep,
-)
-pipe.transformer.enable_cache(config)
-```
-
-### CacheMixin
-
-[[autodoc]] CacheMixin
-
-### PyramidAttentionBroadcastConfig
-
-[[autodoc]] PyramidAttentionBroadcastConfig
-
-[[autodoc]] apply_pyramid_attention_broadcast
@@ -1,30 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# ConsisIDTransformer3DModel
-
-A Diffusion Transformer model for 3D data from [ConsisID](https://github.com/PKU-YuanGroup/ConsisID) was introduced in [Identity-Preserving Text-to-Video Generation by Frequency Decomposition](https://arxiv.org/pdf/2411.17440) by Peking University & University of Rochester & etc.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import ConsisIDTransformer3DModel
-
-transformer = ConsisIDTransformer3DModel.from_pretrained("BestWishYsh/ConsisID-preview", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda")
-```
-
-## ConsisIDTransformer3DModel
-
-[[autodoc]] ConsisIDTransformer3DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
@@ -1,60 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-->
-
-# ConsisID
-
-[Identity-Preserving Text-to-Video Generation by Frequency Decomposition](https://arxiv.org/abs/2411.17440) from Peking University & University of Rochester & etc, by Shenghai Yuan, Jinfa Huang, Xianyi He, Yunyang Ge, Yujun Shi, Liuhan Chen, Jiebo Luo, Li Yuan.
-
-The abstract from the paper is:
-
-*Identity-preserving text-to-video (IPT2V) generation aims to create high-fidelity videos with consistent human identity. It is an important task in video generation but remains an open problem for generative models. This paper pushes the technical frontier of IPT2V in two directions that have not been resolved in the literature: (1) A tuning-free pipeline without tedious case-by-case finetuning, and (2) A frequency-aware heuristic identity-preserving Diffusion Transformer (DiT)-based control scheme. To achieve these goals, we propose **ConsisID**, a tuning-free DiT-based controllable IPT2V model to keep human-**id**entity **consis**tent in the generated video. Inspired by prior findings in frequency analysis of vision/diffusion transformers, it employs identity-control signals in the frequency domain, where facial features can be decomposed into low-frequency global features (e.g., profile, proportions) and high-frequency intrinsic features (e.g., identity markers that remain unaffected by pose changes). First, from a low-frequency perspective, we introduce a global facial extractor, which encodes the reference image and facial key points into a latent space, generating features enriched with low-frequency information. These features are then integrated into the shallow layers of the network to alleviate training challenges associated with DiT. Second, from a high-frequency perspective, we design a local facial extractor to capture high-frequency details and inject them into the transformer blocks, enhancing the model's ability to preserve fine-grained features. To leverage the frequency information for identity preservation, we propose a hierarchical training strategy, transforming a vanilla pre-trained video generation model into an IPT2V model. Extensive experiments demonstrate that our frequency-aware heuristic scheme provides an optimal control solution for DiT-based models. 
Thanks to this scheme, our **ConsisID** achieves excellent results in generating high-quality, identity-preserving videos, making strides towards more effective IPT2V. The model weight of ConsID is publicly available at https://github.com/PKU-YuanGroup/ConsisID.*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-This pipeline was contributed by [SHYuanBest](https://github.com/SHYuanBest). The original codebase can be found [here](https://github.com/PKU-YuanGroup/ConsisID). The original weights can be found under [hf.co/BestWishYsh](https://huggingface.co/BestWishYsh).
-
-There are two official ConsisID checkpoints for identity-preserving text-to-video.
-
-| checkpoints | recommended inference dtype |
-|:---:|:---:|
-| [`BestWishYsh/ConsisID-preview`](https://huggingface.co/BestWishYsh/ConsisID-preview) | torch.bfloat16 |
-| [`BestWishYsh/ConsisID-1.5`](https://huggingface.co/BestWishYsh/ConsisID-preview) | torch.bfloat16 |
-
-### Memory optimization
-
-ConsisID requires about 44 GB of GPU memory to decode 49 frames (6 seconds of video at 8 FPS) with output resolution 720x480 (W x H), which makes it not possible to run on consumer GPUs or free-tier T4 Colab. The following memory optimizations could be used to reduce the memory footprint. For replication, you can refer to [this](https://gist.github.com/SHYuanBest/bc4207c36f454f9e969adbb50eaf8258) script.
-
-| Feature (overlay the previous) | Max Memory Allocated | Max Memory Reserved |
-| :----------------------------- | :------------------- | :------------------ |
-| -                              | 37 GB                | 44 GB               |
-| enable_model_cpu_offload       | 22 GB                | 25 GB               |
-| enable_sequential_cpu_offload  | 16 GB                | 22 GB               |
-| vae.enable_slicing             | 16 GB                | 22 GB               |
-| vae.enable_tiling              | 5 GB                 | 7 GB                |
-
-## ConsisIDPipeline
-
-[[autodoc]] ConsisIDPipeline
-
-  - all
-  - __call__
-
-## ConsisIDPipelineOutput
-
-[[autodoc]] pipelines.consisid.pipeline_output.ConsisIDPipelineOutput
@@ -309,53 +309,6 @@ image.save("output.png")

 When unloading the Control LoRA weights, call `pipe.unload_lora_weights(reset_to_overwritten_params=True)` to reset the `pipe.transformer` completely back to its original form. The resultant pipeline can then be used with methods like [`DiffusionPipeline.from_pipe`]. More details about this argument are available in [this PR](https://github.com/huggingface/diffusers/pull/10397).

-## IP-Adapter
-
-<Tip>
-
-Check out [IP-Adapter](../../../using-diffusers/ip_adapter) to learn more about how IP-Adapters work.
-
-</Tip>
-
-An IP-Adapter lets you prompt Flux with images, in addition to the text prompt. This is especially useful when describing complex concepts that are difficult to articulate through text alone and you have reference images.
-
-```python
-import torch
-from diffusers import FluxPipeline
-from diffusers.utils import load_image
-
-pipe = FluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
-).to("cuda")
-
-image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flux_ip_adapter_input.jpg").resize((1024, 1024))
-
-pipe.load_ip_adapter(
-    "XLabs-AI/flux-ip-adapter",
-    weight_name="ip_adapter.safetensors",
-    image_encoder_pretrained_model_name_or_path="openai/clip-vit-large-patch14"
-)
-pipe.set_ip_adapter_scale(1.0)
-
-image = pipe(
-    width=1024,
-    height=1024,
-    prompt="wearing sunglasses",
-    negative_prompt="",
-    true_cfg=4.0,
-    generator=torch.Generator().manual_seed(4444),
-    ip_adapter_image=image,
-).images[0]
-
-image.save('flux_ip_adapter_output.jpg')
-```
-
-<div class="justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flux_ip_adapter_output.jpg"/>
-    <figcaption class="mt-2 text-sm text-center text-gray-500">IP-Adapter examples with prompt "wearing sunglasses"</figcaption>
-</div>
-
-
 ## Running FP16 inference

 Flux can generate high-quality images with FP16 (i.e. to accelerate inference on Turing/Volta GPUs) but produces different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16, which affects the overall image. Forcing text encoders to run with FP32 inference thus removes this output difference. See [here](https://github.com/huggingface/diffusers/pull/9097#issuecomment-2272292516) for details.
@@ -115,7 +115,7 @@ export_to_video(frames, "mochi.mp4", fps=30)

 ## Reproducing the results from the Genmo Mochi repo

-The [Genmo Mochi implementation](https://github.com/genmoai/mochi/tree/main) uses different precision values for each stage in the inference process. The text encoder and VAE use `torch.float32`, while the DiT uses `torch.bfloat16` with the [attention kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html#torch.nn.attention.sdpa_kernel) set to `EFFICIENT_ATTENTION`. Diffusers pipelines currently do not support setting different `dtypes` for different stages of the pipeline. In order to run inference in the same way as the original implementation, please refer to the following example.
+The [Genmo Mochi implementation](https://github.com/genmoai/mochi/tree/main) uses different precision values for each stage in the inference process. The text encoder and VAE use `torch.float32`, while the DiT uses `torch.bfloat16` with the [attention kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html#torch.nn.attention.sdpa_kernel) set to `EFFICIENT_ATTENTION`. Diffusers pipelines currently do not support setting different `dtypes` for different stages of the pipeline. In order to run inference in the same way as the original implementation, please refer to the following example.

 <Tip>
 The original Mochi implementation zeros out empty prompts. However, enabling this option and placing the entire pipeline under autocast can lead to numerical overflows with the T5 text encoder.
@@ -77,7 +77,7 @@ from diffusers import StableDiffusion3Pipeline
 from transformers import SiglipVisionModel, SiglipImageProcessor

 image_encoder_id = "google/siglip-so400m-patch14-384"
-ip_adapter_id = "guiyrt/InstantX-SD3.5-Large-IP-Adapter-diffusers"
+ip_adapter_id = "InstantX/SD3.5-Large-IP-Adapter"

 feature_extractor = SiglipImageProcessor.from_pretrained(
    image_encoder_id,
@@ -41,7 +41,3 @@ Utility and helper functions for working with 🤗 Diffusers.
 ## randn_tensor

 [[autodoc]] utils.torch_utils.randn_tensor
-
-## apply_layerwise_casting
-
-[[autodoc]] hooks.layerwise_casting.apply_layerwise_casting
@@ -23,60 +23,32 @@ You should install 🤗 Diffusers in a [virtual environment](https://docs.python
 If you're unfamiliar with Python virtual environments, take a look at this [guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
 A virtual environment makes it easier to manage different projects and avoid compatibility issues between dependencies.

-Create a virtual environment with Python or [uv](https://docs.astral.sh/uv/) (refer to [Installation](https://docs.astral.sh/uv/getting-started/installation/) for installation instructions), a fast Rust-based Python package and project manager.
-
-<hfoptions id="install">
-<hfoption id="uv">
+Start by creating a virtual environment in your project directory:

 ```bash
-uv venv my-env
-source my-env/bin/activate
+python -m venv .env
 ```

-</hfoption>
-<hfoption id="Python">
+Activate the virtual environment:

 ```bash
-python -m venv my-env
-source my-env/bin/activate
+source .env/bin/activate
 ```

-</hfoption>
-</hfoptions>
-
-You should also install 🤗 Transformers because 🤗 Diffusers relies on its models.
+You should also install 🤗 Transformers because 🤗 Diffusers relies on its models:


 <frameworkcontent>
 <pt>
-
-PyTorch only supports Python 3.8 - 3.11 on Windows. Install Diffusers with uv.
-
-```bash
-uv install diffusers["torch"] transformers
-```
-
-You can also install Diffusers with pip.
-
+Note - PyTorch only supports Python 3.8 - 3.11 on Windows.
 ```bash
 pip install diffusers["torch"] transformers
 ```
-
 </pt>
 <jax>
-
-Install Diffusers with uv.
-
-```bash
-uv pip install diffusers["flax"] transformers
-```
-
-You can also install Diffusers with pip.
-
 ```bash
 pip install diffusers["flax"] transformers
 ```
-
 </jax>
 </frameworkcontent>

@@ -158,43 +158,6 @@ In order to properly offload models after they're called, it is required to run

 </Tip>

-## FP8 layerwise weight-casting
-
-PyTorch supports `torch.float8_e4m3fn` and `torch.float8_e5m2` as weight storage dtypes, but they can't be used for computation in many different tensor operations due to unimplemented kernel support. However, you can use these dtypes to store model weights in fp8 precision and upcast them on-the-fly when the layers are used in the forward pass. This is known as layerwise weight-casting.
-
-Typically, inference on most models is done with `torch.float16` or `torch.bfloat16` weight/computation precision. Layerwise weight-casting cuts down the memory footprint of the model weights by approximately half.
-
-```python
-import torch
-from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel
-from diffusers.utils import export_to_video
-
-model_id = "THUDM/CogVideoX-5b"
-
-# Load the model in bfloat16 and enable layerwise casting
-transformer = CogVideoXTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.bfloat16)
-transformer.enable_layerwise_casting(storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16)
-
-# Load the pipeline
-pipe = CogVideoXPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-
-prompt = (
-    "A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
-    "The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
-    "pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, "
-    "casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. "
-    "The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
-    "atmosphere of this unique musical performance."
-)
-video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
-export_to_video(video, "output.mp4", fps=8)
-```
-
-In the above example, layerwise casting is enabled on the transformer component of the pipeline. By default, certain layers are skipped from the FP8 weight casting because it can lead to significant degradation of generation quality. The normalization and modulation related weight parameters are also skipped by default.
-
-However, you gain more control and flexibility by directly utilizing the [`~hooks.layerwise_casting.apply_layerwise_casting`] function instead of [`~ModelMixin.enable_layerwise_casting`].
-
 ## Channels-last memory format

 The channels-last memory format is an alternative way of ordering NCHW tensors in memory to preserve dimension ordering. Channels-last tensors are ordered in such a way that the channels become the densest dimension (storing images pixel-per-pixel). Since not all operators currently support the channels-last format, it may result in worse performance but you should still try and see if it works for your model.
@@ -1,497 +0,0 @@
-# ParaAttention
-
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-performance.png">
-</div>
-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/hunyuan-video-performance.png">
-</div>
-
-
-Large image and video generation models, such as [FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) and [HunyuanVideo](https://huggingface.co/tencent/HunyuanVideo), can be an inference challenge for real-time applications and deployment because of their size.
-
-[ParaAttention](https://github.com/chengzeyi/ParaAttention) is a library that implements **context parallelism** and **first block cache**, and can be combined with other techniques (torch.compile, fp8 dynamic quantization), to accelerate inference.
-
-This guide will show you how to apply ParaAttention to FLUX.1-dev and HunyuanVideo on NVIDIA L20 GPUs.
-No optimizations are applied for our baseline benchmark, except for HunyuanVideo to avoid out-of-memory errors.
-
-Our baseline benchmark shows that FLUX.1-dev is able to generate a 1024x1024 resolution image in 28 steps in 26.36 seconds, and HunyuanVideo is able to generate 129 frames at 720p resolution in 30 steps in 3675.71 seconds.
-
-> [!TIP]
-> For even faster inference with context parallelism, try using NVIDIA A100 or H100 GPUs (if available) with NVLink support, especially when there is a large number of GPUs.
-
-## First Block Cache
-
-Caching the outputs of the transformer blocks in the model and reusing them in subsequent inference steps reduces the computation cost and makes inference faster.
-
-However, it is hard to decide when to reuse the cache while preserving the quality of the generated images or videos. ParaAttention directly uses the **residual difference of the first transformer block output** to approximate the difference among model outputs. When the difference is small enough, the residual difference of previous inference steps is reused. In other words, the denoising step is skipped.
-
-This achieves a 2x speedup on FLUX.1-dev and HunyuanVideo inference with very good quality.
-
-<figure>
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/ada-cache.png" alt="Cache in Diffusion Transformer" />
-    <figcaption>How AdaCache works, First Block Cache is a variant of it</figcaption>
-</figure>
-
-<hfoptions id="first-block-cache">
-<hfoption id="FLUX-1.dev">
-
-To apply first block cache on FLUX.1-dev, call `apply_cache_on_pipe` as shown below. 0.08 is the default residual difference value for FLUX models.
-
-```python
-import time
-import torch
-from diffusers import FluxPipeline
-
-pipe = FluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    torch_dtype=torch.bfloat16,
-).to("cuda")
-
-from para_attn.first_block_cache.diffusers_adapters import apply_cache_on_pipe
-
-apply_cache_on_pipe(pipe, residual_diff_threshold=0.08)
-
-# Enable memory savings
-# pipe.enable_model_cpu_offload()
-# pipe.enable_sequential_cpu_offload()
-
-begin = time.time()
-image = pipe(
-    "A cat holding a sign that says hello world",
-    num_inference_steps=28,
-).images[0]
-end = time.time()
-print(f"Time: {end - begin:.2f}s")
-
-print("Saving image to flux.png")
-image.save("flux.png")
-```
-
-| Optimizations | Original | FBCache rdt=0.06 | FBCache rdt=0.08 | FBCache rdt=0.10 | FBCache rdt=0.12 |
-| - | - | - | - | - | - |
-| Preview | ![Original](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-original.png) | ![FBCache rdt=0.06](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.06.png) | ![FBCache rdt=0.08](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.08.png) | ![FBCache rdt=0.10](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.10.png) | ![FBCache rdt=0.12](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/flux-fbc-0.12.png) |
-| Wall Time (s) | 26.36 | 21.83 | 17.01 | 16.00 | 13.78 |
-
-First Block Cache reduced the inference time to 17.01 seconds, or 1.55x faster than the baseline, while maintaining nearly zero quality loss.
-
-</hfoption>
-<hfoption id="HunyuanVideo">
-
-To apply First Block Cache on HunyuanVideo, call `apply_cache_on_pipe` as shown below. 0.06 is the default residual difference value for HunyuanVideo models.
-
-```python
-import time
-import torch
-from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
-from diffusers.utils import export_to_video
-
-model_id = "tencent/HunyuanVideo"
-transformer = HunyuanVideoTransformer3DModel.from_pretrained(
-    model_id,
-    subfolder="transformer",
-    torch_dtype=torch.bfloat16,
-    revision="refs/pr/18",
-)
-pipe = HunyuanVideoPipeline.from_pretrained(
-    model_id,
-    transformer=transformer,
-    torch_dtype=torch.float16,
-    revision="refs/pr/18",
-).to("cuda")
-
-from para_attn.first_block_cache.diffusers_adapters import apply_cache_on_pipe
-
-apply_cache_on_pipe(pipe, residual_diff_threshold=0.6)
-
-pipe.vae.enable_tiling()
-
-begin = time.time()
-output = pipe(
-    prompt="A cat walks on the grass, realistic",
-    height=720,
-    width=1280,
-    num_frames=129,
-    num_inference_steps=30,
-).frames[0]
-end = time.time()
-print(f"Time: {end - begin:.2f}s")
-
-print("Saving video to hunyuan_video.mp4")
-export_to_video(output, "hunyuan_video.mp4", fps=15)
-```
-
-<video controls>
-  <source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/hunyuan-video-original.mp4" type="video/mp4">
-  Your browser does not support the video tag.
-</video>
-
-<small> HunyuanVideo without FBCache </small>
-
-<video controls>
-  <source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/para-attn/hunyuan-video-fbc.mp4" type="video/mp4">
-  Your browser does not support the video tag.
-</video>
-
-<small> HunyuanVideo with FBCache </small>
-
-First Block Cache reduced the inference time to 2271.06 seconds, or 1.62x faster than the baseline, while maintaining nearly zero quality loss.
-
-</hfoption>
-</hfoptions>
-
-## fp8 quantization
-
-fp8 with dynamic quantization further speeds up inference and reduces memory usage. Both the activations and weights must be quantized in order to use the 8-bit [NVIDIA Tensor Cores](https://www.nvidia.com/en-us/data-center/tensor-cores/).
-
-Use `float8_weight_only` and `float8_dynamic_activation_float8_weight` to quantize the text encoder and transformer model.
-
-The default quantization method is per tensor quantization, but if your GPU supports row-wise quantization, you can also try it for better accuracy.
-
-Install [torchao](https://github.com/pytorch/ao/tree/main) with the command below.
-
-```bash
-pip3 install -U torch torchao
-```
-
-[torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) with `mode="max-autotune-no-cudagraphs"` or `mode="max-autotune"` selects the best kernel for performance. Compilation can take a long time if it's the first time the model is called, but it is worth it once the model has been compiled.
-
-This example only quantizes the transformer model, but you can also quantize the text encoder to reduce memory usage even more.
-
-> [!TIP]
-> Dynamic quantization can significantly change the distribution of the model output, so you need to change the `residual_diff_threshold` to a larger value for it to take effect.
-
-<hfoptions id="fp8-quantization">
-<hfoption id="FLUX-1.dev">
-
-```python
-import time
-import torch
-from diffusers import FluxPipeline
-
-pipe = FluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    torch_dtype=torch.bfloat16,
-).to("cuda")
-
-from para_attn.first_block_cache.diffusers_adapters import apply_cache_on_pipe
-
-apply_cache_on_pipe(
-    pipe,
-    residual_diff_threshold=0.12,  # Use a larger value to make the cache take effect
-)
-
-from torchao.quantization import quantize_, float8_dynamic_activation_float8_weight, float8_weight_only
-
-quantize_(pipe.text_encoder, float8_weight_only())
-quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())
-pipe.transformer = torch.compile(
-   pipe.transformer, mode="max-autotune-no-cudagraphs",
-)
-
-# Enable memory savings
-# pipe.enable_model_cpu_offload()
-# pipe.enable_sequential_cpu_offload()
-
-for i in range(2):
-    begin = time.time()
-    image = pipe(
-        "A cat holding a sign that says hello world",
-        num_inference_steps=28,
-    ).images[0]
-    end = time.time()
-    if i == 0:
-        print(f"Warm up time: {end - begin:.2f}s")
-    else:
-        print(f"Time: {end - begin:.2f}s")
-
-print("Saving image to flux.png")
-image.save("flux.png")
-```
-
-fp8 dynamic quantization and torch.compile reduced the inference time to 7.56 seconds, or 3.48x faster than the baseline.
-
-</hfoption>
-<hfoption id="HunyuanVideo">
-
-```python
-import time
-import torch
-from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
-from diffusers.utils import export_to_video
-
-model_id = "tencent/HunyuanVideo"
-transformer = HunyuanVideoTransformer3DModel.from_pretrained(
-    model_id,
-    subfolder="transformer",
-    torch_dtype=torch.bfloat16,
-    revision="refs/pr/18",
-)
-pipe = HunyuanVideoPipeline.from_pretrained(
-    model_id,
-    transformer=transformer,
-    torch_dtype=torch.float16,
-    revision="refs/pr/18",
-).to("cuda")
-
-from para_attn.first_block_cache.diffusers_adapters import apply_cache_on_pipe
-
-apply_cache_on_pipe(pipe)
-
-from torchao.quantization import quantize_, float8_dynamic_activation_float8_weight, float8_weight_only
-
-quantize_(pipe.text_encoder, float8_weight_only())
-quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())
-pipe.transformer = torch.compile(
-   pipe.transformer, mode="max-autotune-no-cudagraphs",
-)
-
-# Enable memory savings
-pipe.vae.enable_tiling()
-# pipe.enable_model_cpu_offload()
-# pipe.enable_sequential_cpu_offload()
-
-for i in range(2):
-    begin = time.time()
-    output = pipe(
-        prompt="A cat walks on the grass, realistic",
-        height=720,
-        width=1280,
-        num_frames=129,
-        num_inference_steps=1 if i == 0 else 30,
-    ).frames[0]
-    end = time.time()
-    if i == 0:
-        print(f"Warm up time: {end - begin:.2f}s")
-    else:
-        print(f"Time: {end - begin:.2f}s")
-
-print("Saving video to hunyuan_video.mp4")
-export_to_video(output, "hunyuan_video.mp4", fps=15)
-```
-
-An NVIDIA L20 GPU only has 48GB of memory and could face out-of-memory (OOM) errors after compilation if `enable_model_cpu_offload` isn't called, because HunyuanVideo has very large activation tensors when running with a high resolution and a large number of frames. For GPUs with less than 80GB of memory, you can try reducing the resolution and number of frames to avoid OOM errors.
-
-Large video generation models are usually bottlenecked by the attention computations rather than the fully connected layers. These models don't significantly benefit from quantization and torch.compile.
-
-</hfoption>
-</hfoptions>
-
-## Context Parallelism
-
-Context Parallelism parallelizes inference and scales with multiple GPUs. The ParaAttention compositional design allows you to combine Context Parallelism with First Block Cache and dynamic quantization.
-
-> [!TIP]
-> Refer to the [ParaAttention](https://github.com/chengzeyi/ParaAttention/tree/main) repository for detailed instructions and examples of how to scale inference with multiple GPUs.
-
-If the inference process needs to be persistent and serviceable, it is suggested to use [torch.multiprocessing](https://pytorch.org/docs/stable/multiprocessing.html) to write your own inference processor. This can eliminate the overhead of launching the process and loading and recompiling the model.
-
-<hfoptions id="context-parallelism">
-<hfoption id="FLUX-1.dev">
-
-The code sample below combines First Block Cache, fp8 dynamic quantization, torch.compile, and Context Parallelism for the fastest inference speed.
-
-```python
-import time
-import torch
-import torch.distributed as dist
-from diffusers import FluxPipeline
-
-dist.init_process_group()
-
-torch.cuda.set_device(dist.get_rank())
-
-pipe = FluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    torch_dtype=torch.bfloat16,
-).to("cuda")
-
-from para_attn.context_parallel import init_context_parallel_mesh
-from para_attn.context_parallel.diffusers_adapters import parallelize_pipe
-from para_attn.parallel_vae.diffusers_adapters import parallelize_vae
-
-mesh = init_context_parallel_mesh(
-    pipe.device.type,
-    max_ring_dim_size=2,
-)
-parallelize_pipe(
-    pipe,
-    mesh=mesh,
-)
-parallelize_vae(pipe.vae, mesh=mesh._flatten())
-
-from para_attn.first_block_cache.diffusers_adapters import apply_cache_on_pipe
-
-apply_cache_on_pipe(
-    pipe,
-    residual_diff_threshold=0.12,  # Use a larger value to make the cache take effect
-)
-
-from torchao.quantization import quantize_, float8_dynamic_activation_float8_weight, float8_weight_only
-
-quantize_(pipe.text_encoder, float8_weight_only())
-quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())
-torch._inductor.config.reorder_for_compute_comm_overlap = True
-pipe.transformer = torch.compile(
-   pipe.transformer, mode="max-autotune-no-cudagraphs",
-)
-
-# Enable memory savings
-# pipe.enable_model_cpu_offload(gpu_id=dist.get_rank())
-# pipe.enable_sequential_cpu_offload(gpu_id=dist.get_rank())
-
-for i in range(2):
-    begin = time.time()
-    image = pipe(
-        "A cat holding a sign that says hello world",
-        num_inference_steps=28,
-        output_type="pil" if dist.get_rank() == 0 else "pt",
-    ).images[0]
-    end = time.time()
-    if dist.get_rank() == 0:
-        if i == 0:
-            print(f"Warm up time: {end - begin:.2f}s")
-        else:
-            print(f"Time: {end - begin:.2f}s")
-
-if dist.get_rank() == 0:
-    print("Saving image to flux.png")
-    image.save("flux.png")
-
-dist.destroy_process_group()
-```
-
-Save to `run_flux.py` and launch it with [torchrun](https://pytorch.org/docs/stable/elastic/run.html).
-
-```bash
-# Use --nproc_per_node to specify the number of GPUs
-torchrun --nproc_per_node=2 run_flux.py
-```
-
-Inference time is reduced to 8.20 seconds, or 3.21x faster than the baseline, with 2 NVIDIA L20 GPUs. On 4 L20s, inference time is 3.90 seconds, or 6.75x faster.
-
-</hfoption>
-<hfoption id="HunyuanVideo">
-
-The code sample below combines First Block Cache and Context Parallelism for the fastest inference speed.
-
-```python
-import time
-import torch
-import torch.distributed as dist
-from diffusers import HunyuanVideoPipeline, HunyuanVideoTransformer3DModel
-from diffusers.utils import export_to_video
-
-dist.init_process_group()
-
-torch.cuda.set_device(dist.get_rank())
-
-model_id = "tencent/HunyuanVideo"
-transformer = HunyuanVideoTransformer3DModel.from_pretrained(
-    model_id,
-    subfolder="transformer",
-    torch_dtype=torch.bfloat16,
-    revision="refs/pr/18",
-)
-pipe = HunyuanVideoPipeline.from_pretrained(
-    model_id,
-    transformer=transformer,
-    torch_dtype=torch.float16,
-    revision="refs/pr/18",
-).to("cuda")
-
-from para_attn.context_parallel import init_context_parallel_mesh
-from para_attn.context_parallel.diffusers_adapters import parallelize_pipe
-from para_attn.parallel_vae.diffusers_adapters import parallelize_vae
-
-mesh = init_context_parallel_mesh(
-    pipe.device.type,
-)
-parallelize_pipe(
-    pipe,
-    mesh=mesh,
-)
-parallelize_vae(pipe.vae, mesh=mesh._flatten())
-
-from para_attn.first_block_cache.diffusers_adapters import apply_cache_on_pipe
-
-apply_cache_on_pipe(pipe)
-
-# from torchao.quantization import quantize_, float8_dynamic_activation_float8_weight, float8_weight_only
-#
-# torch._inductor.config.reorder_for_compute_comm_overlap = True
-#
-# quantize_(pipe.text_encoder, float8_weight_only())
-# quantize_(pipe.transformer, float8_dynamic_activation_float8_weight())
-# pipe.transformer = torch.compile(
-#    pipe.transformer, mode="max-autotune-no-cudagraphs",
-# )
-
-# Enable memory savings
-pipe.vae.enable_tiling()
-# pipe.enable_model_cpu_offload(gpu_id=dist.get_rank())
-# pipe.enable_sequential_cpu_offload(gpu_id=dist.get_rank())
-
-for i in range(2):
-    begin = time.time()
-    output = pipe(
-        prompt="A cat walks on the grass, realistic",
-        height=720,
-        width=1280,
-        num_frames=129,
-        num_inference_steps=1 if i == 0 else 30,
-        output_type="pil" if dist.get_rank() == 0 else "pt",
-    ).frames[0]
-    end = time.time()
-    if dist.get_rank() == 0:
-        if i == 0:
-            print(f"Warm up time: {end - begin:.2f}s")
-        else:
-            print(f"Time: {end - begin:.2f}s")
-
-if dist.get_rank() == 0:
-    print("Saving video to hunyuan_video.mp4")
-    export_to_video(output, "hunyuan_video.mp4", fps=15)
-
-dist.destroy_process_group()
-```
-
-Save to `run_hunyuan_video.py` and launch it with [torchrun](https://pytorch.org/docs/stable/elastic/run.html).
-
-```bash
-# Use --nproc_per_node to specify the number of GPUs
-torchrun --nproc_per_node=8 run_hunyuan_video.py
-```
-
-Inference time is reduced to 649.23 seconds, or 5.66x faster than the baseline, with 8 NVIDIA L20 GPUs.
-
-</hfoption>
-</hfoptions>
-
-## Benchmarks
-
-<hfoptions id="conclusion">
-<hfoption id="FLUX-1.dev">
-
-| GPU Type | Number of GPUs | Optimizations | Wall Time (s) | Speedup |
-| - | - | - | - | - |
-| NVIDIA L20 | 1 | Baseline | 26.36 | 1.00x |
-| NVIDIA L20 | 1 | FBCache (rdt=0.08) | 17.01 | 1.55x |
-| NVIDIA L20 | 1 | FP8 DQ | 13.40 | 1.96x |
-| NVIDIA L20 | 1 | FBCache (rdt=0.12) + FP8 DQ | 7.56 | 3.48x |
-| NVIDIA L20 | 2 | FBCache (rdt=0.12) + FP8 DQ + CP | 4.92 | 5.35x |
-| NVIDIA L20 | 4 | FBCache (rdt=0.12) + FP8 DQ + CP | 3.90 | 6.75x |
-
-</hfoption>
-<hfoption id="HunyuanVideo">
-
-| GPU Type | Number of GPUs | Optimizations | Wall Time (s) | Speedup |
-| - | - | - | - | - |
-| NVIDIA L20 | 1 | Baseline | 3675.71 | 1.00x |
-| NVIDIA L20 | 1 | FBCache | 2271.06 | 1.62x |
-| NVIDIA L20 | 2 | FBCache + CP | 1132.90 | 3.24x |
-| NVIDIA L20 | 4 | FBCache + CP | 718.15 | 5.12x |
-| NVIDIA L20 | 8 | FBCache + CP | 649.23 | 5.66x |
-
-</hfoption>
-</hfoptions>
@@ -1,96 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-# ConsisID
-
-[ConsisID](https://github.com/PKU-YuanGroup/ConsisID) is an identity-preserving text-to-video generation model that keeps the face consistent in the generated video by frequency decomposition. The main features of ConsisID are:
-
- Frequency decomposition: The characteristics of the DiT architecture are analyzed from the frequency domain perspective, and based on these characteristics, a reasonable control information injection method is designed.
- Consistency training strategy: A coarse-to-fine training strategy, dynamic masking loss, and dynamic cross-face loss further enhance the model's generalization ability and identity preservation performance.
- Inference without finetuning: Previous methods required case-by-case finetuning of the input ID before inference, leading to significant time and computational costs. In contrast, ConsisID is tuning-free.
-
-This guide will walk you through using ConsisID to generate identity-preserving videos.
-
-## Load Model Checkpoints
-
-Model weights may be stored in separate subfolders on the Hub or locally, in which case, you should use the [`~DiffusionPipeline.from_pretrained`] method.
-
-```python
-# !pip install consisid_eva_clip insightface facexlib
-import torch
-from diffusers import ConsisIDPipeline
-from diffusers.pipelines.consisid.consisid_utils import prepare_face_models, process_face_embeddings_infer
-from huggingface_hub import snapshot_download
-
-# Download ckpts
-snapshot_download(repo_id="BestWishYsh/ConsisID-preview", local_dir="BestWishYsh/ConsisID-preview")
-
-# Load face helper model to preprocess input face image
-face_helper_1, face_helper_2, face_clip_model, face_main_model, eva_transform_mean, eva_transform_std = prepare_face_models("BestWishYsh/ConsisID-preview", device="cuda", dtype=torch.bfloat16)
-
-# Load consisid base model
-pipe = ConsisIDPipeline.from_pretrained("BestWishYsh/ConsisID-preview", torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-```
-
-## Identity-Preserving Text-to-Video
-
-For identity-preserving text-to-video, pass a text prompt and an image containing a clear face (preferably a half-body or full-body shot). By default, ConsisID generates a 720x480 video for the best results.
-
-```python
-from diffusers.utils import export_to_video
-
-prompt = "The video captures a boy walking along a city street, filmed in black and white on a classic 35mm camera. His expression is thoughtful, his brow slightly furrowed as if he's lost in contemplation. The film grain adds a textured, timeless quality to the image, evoking a sense of nostalgia. Around him, the cityscape is filled with vintage buildings, cobblestone sidewalks, and softly blurred figures passing by, their outlines faint and indistinct. Streetlights cast a gentle glow, while shadows play across the boy's path, adding depth to the scene. The lighting highlights the boy's subtle smile, hinting at a fleeting moment of curiosity. The overall cinematic atmosphere, complete with classic film still aesthetics and dramatic contrasts, gives the scene an evocative and introspective feel."
-image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_input.png?download=true"
-
-id_cond, id_vit_hidden, image, face_kps = process_face_embeddings_infer(face_helper_1, face_clip_model, face_helper_2, eva_transform_mean, eva_transform_std, face_main_model, "cuda", torch.bfloat16, image, is_align_face=True)
-
-video = pipe(image=image, prompt=prompt, num_inference_steps=50, guidance_scale=6.0, use_dynamic_cfg=False, id_vit_hidden=id_vit_hidden, id_cond=id_cond, kps_cond=face_kps, generator=torch.Generator("cuda").manual_seed(42))
-export_to_video(video.frames[0], "output.mp4", fps=8)
-```
-<table>
-  <tr>
-    <th style="text-align: center;">Face Image</th>
-    <th style="text-align: center;">Video</th>
-    <th style="text-align: center;">Description</th>
-  </tr>
-  <tr>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_image_0.png?download=true" style="height: auto; width: 600px;"></td>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_output_0.gif?download=true" style="height: auto; width: 2000px;"></td>
-    <td>The video, in a beautifully crafted animated style, features a confident woman riding a horse through a lush forest clearing. Her expression is focused yet serene as she adjusts her wide-brimmed hat with a practiced hand. She wears a flowy bohemian dress, which moves gracefully with the rhythm of the horse, the fabric flowing fluidly in the animated motion. The dappled sunlight filters through the trees, casting soft, painterly patterns on the forest floor. Her posture is poised, showing both control and elegance as she guides the horse with ease. The animation's gentle, fluid style adds a dreamlike quality to the scene, with the woman’s calm demeanor and the peaceful surroundings evoking a sense of freedom and harmony.</td>
-  </tr>
-  <tr>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_image_1.png?download=true" style="height: auto; width: 600px;"></td>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_output_1.gif?download=true" style="height: auto; width: 2000px;"></td>
-    <td>The video, in a captivating animated style, shows a woman standing in the center of a snowy forest, her eyes narrowed in concentration as she extends her hand forward. She is dressed in a deep blue cloak, her breath visible in the cold air, which is rendered with soft, ethereal strokes. A faint smile plays on her lips as she summons a wisp of ice magic, watching with focus as the surrounding trees and ground begin to shimmer and freeze, covered in delicate ice crystals. The animation’s fluid motion brings the magic to life, with the frost spreading outward in intricate, sparkling patterns. The environment is painted with soft, watercolor-like hues, enhancing the magical, dreamlike atmosphere. The overall mood is serene yet powerful, with the quiet winter air amplifying the delicate beauty of the frozen scene.</td>
-  </tr>
-  <tr>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_image_2.png?download=true" style="height: auto; width: 600px;"></td>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_output_2.gif?download=true" style="height: auto; width: 2000px;"></td>
-    <td>The animation features a whimsical portrait of a balloon seller standing in a gentle breeze, captured with soft, hazy brushstrokes that evoke the feel of a serene spring day. His face is framed by a gentle smile, his eyes squinting slightly against the sun, while a few wisps of hair flutter in the wind. He is dressed in a light, pastel-colored shirt, and the balloons around him sway with the wind, adding a sense of playfulness to the scene. The background blurs softly, with hints of a vibrant market or park, enhancing the light-hearted, yet tender mood of the moment.</td>
-  </tr>
-  <tr>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_image_3.png?download=true" style="height: auto; width: 600px;"></td>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_output_3.gif?download=true" style="height: auto; width: 2000px;"></td>
-    <td>The video captures a boy walking along a city street, filmed in black and white on a classic 35mm camera. His expression is thoughtful, his brow slightly furrowed as if he's lost in contemplation. The film grain adds a textured, timeless quality to the image, evoking a sense of nostalgia. Around him, the cityscape is filled with vintage buildings, cobblestone sidewalks, and softly blurred figures passing by, their outlines faint and indistinct. Streetlights cast a gentle glow, while shadows play across the boy's path, adding depth to the scene. The lighting highlights the boy's subtle smile, hinting at a fleeting moment of curiosity. The overall cinematic atmosphere, complete with classic film still aesthetics and dramatic contrasts, gives the scene an evocative and introspective feel.</td>
-  </tr>
-  <tr>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_image_4.png?download=true" style="height: auto; width: 600px;"></td>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_output_4.gif?download=true" style="height: auto; width: 2000px;"></td>
-    <td>The video features a baby wearing a bright superhero cape, standing confidently with arms raised in a powerful pose. The baby has a determined look on their face, with eyes wide and lips pursed in concentration, as if ready to take on a challenge. The setting appears playful, with colorful toys scattered around and a soft rug underfoot, while sunlight streams through a nearby window, highlighting the fluttering cape and adding to the impression of heroism. The overall atmosphere is lighthearted and fun, with the baby's expressions capturing a mix of innocence and an adorable attempt at bravery, as if truly ready to save the day.</td>
-  </tr>
-</table>
-
-## Resources
-
-Learn more about ConsisID with the following resources.
- A [video](https://www.youtube.com/watch?v=PhlgC-bI5SQ) demonstrating ConsisID's main features.
- The research paper, [Identity-Preserving Text-to-Video Generation by Frequency Decomposition](https://hf.co/papers/2411.17440) for more details.
@@ -240,46 +240,6 @@ Benefits of using a single-file layout include:
 1. Easy compatibility with diffusion interfaces such as [ComfyUI](https://github.com/comfyanonymous/ComfyUI) or [Automatic1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui) which commonly use a single-file layout.
 2. Easier to manage (download and share) a single file.

-### DDUF
-
-> [!WARNING]
-> DDUF is an experimental file format and APIs related to it can change in the future.
-
-DDUF (**D**DUF **D**iffusion **U**nified **F**ormat) is a file format designed to make storing, distributing, and using diffusion models much easier. Built on the ZIP file format, DDUF offers a standardized, efficient, and flexible way to package all parts of a diffusion model into a single, easy-to-manage file. It provides a balance between Diffusers multi-folder format and the widely popular single-file format.
-
-Learn more details about DDUF on the Hugging Face Hub [documentation](https://huggingface.co/docs/hub/dduf).
-
-Pass a checkpoint to the `dduf_file` parameter to load it in [`DiffusionPipeline`].
-
-```py
-from diffusers import DiffusionPipeline
-import torch
-
-pipe = DiffusionPipeline.from_pretrained(
-    "DDUF/FLUX.1-dev-DDUF", dduf_file="FLUX.1-dev.dduf", torch_dtype=torch.bfloat16
-).to("cuda")
-image = pipe(
-    "photo a cat holding a sign that says Diffusers", num_inference_steps=50, guidance_scale=3.5
-).images[0]
-image.save("cat.png")
-```
-
-To save a pipeline as a `.dduf` checkpoint, use the [`~huggingface_hub.export_folder_as_dduf`] utility, which takes care of all the necessary file-level validations.
-
-```py
-from huggingface_hub import export_folder_as_dduf
-from diffusers import DiffusionPipeline
-import torch 
-
-pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
-
-save_folder = "flux-dev"
-pipe.save_pretrained("flux-dev")
-export_folder_as_dduf("flux-dev.dduf", folder_path=save_folder)
-```
-
-> [!TIP]
-> Packaging and loading quantized checkpoints in the DDUF format is supported as long as they respect the multi-folder structure.
-
 ## Convert layout and files

 Diffusers provides many scripts and methods to convert storage layouts and file formats to enable broader support across the diffusion ecosystem.
@@ -5,8 +5,6 @@
    title: 快速入门
  - local: stable_diffusion
    title: 有效和高效的扩散
-  - local: consisid 
-    title: 身份保持的文本到视频生成
  - local: installation
    title: 安装
  title: 开始
@@ -1,100 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-# ConsisID
-
-[ConsisID](https://github.com/PKU-YuanGroup/ConsisID)是一种身份保持的文本到视频生成模型，其通过频率分解在生成的视频中保持面部一致性。它具有以下特点：
-
- 基于频率分解：将人物ID特征解耦为高频和低频部分，从频域的角度分析DIT架构的特性，并且基于此特性设计合理的控制信息注入方式。
-
- 一致性训练策略：我们提出粗到细训练策略、动态掩码损失、动态跨脸损失，进一步提高了模型的泛化能力和身份保持效果。
-
-
- 推理无需微调：之前的方法在推理前，需要对输入id进行case-by-case微调，时间和算力开销较大，而我们的方法是tuning-free的。
-
-
-本指南将指导您使用 ConsisID 生成身份保持的视频。
-
-## Load Model Checkpoints
-模型权重可以存储在Hub上或本地的单独子文件夹中，在这种情况下，您应该使用 [`~DiffusionPipeline.from_pretrained`] 方法。
-
-
-```python
-# !pip install consisid_eva_clip insightface facexlib
-import torch
-from diffusers import ConsisIDPipeline
-from diffusers.pipelines.consisid.consisid_utils import prepare_face_models, process_face_embeddings_infer
-from huggingface_hub import snapshot_download
-
-# Download ckpts
-snapshot_download(repo_id="BestWishYsh/ConsisID-preview", local_dir="BestWishYsh/ConsisID-preview")
-
-# Load face helper model to preprocess input face image
-face_helper_1, face_helper_2, face_clip_model, face_main_model, eva_transform_mean, eva_transform_std = prepare_face_models("BestWishYsh/ConsisID-preview", device="cuda", dtype=torch.bfloat16)
-
-# Load consisid base model
-pipe = ConsisIDPipeline.from_pretrained("BestWishYsh/ConsisID-preview", torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-```
-
-## Identity-Preserving Text-to-Video
-对于身份保持的文本到视频生成，需要输入文本提示和包含清晰面部（例如，最好是半身或全身）的图像。默认情况下，ConsisID 会生成 720x480 的视频以获得最佳效果。
-
-```python
-from diffusers.utils import export_to_video
-
-prompt = "The video captures a boy walking along a city street, filmed in black and white on a classic 35mm camera. His expression is thoughtful, his brow slightly furrowed as if he's lost in contemplation. The film grain adds a textured, timeless quality to the image, evoking a sense of nostalgia. Around him, the cityscape is filled with vintage buildings, cobblestone sidewalks, and softly blurred figures passing by, their outlines faint and indistinct. Streetlights cast a gentle glow, while shadows play across the boy's path, adding depth to the scene. The lighting highlights the boy's subtle smile, hinting at a fleeting moment of curiosity. The overall cinematic atmosphere, complete with classic film still aesthetics and dramatic contrasts, gives the scene an evocative and introspective feel."
-image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_input.png?download=true"
-
-id_cond, id_vit_hidden, image, face_kps = process_face_embeddings_infer(face_helper_1, face_clip_model, face_helper_2, eva_transform_mean, eva_transform_std, face_main_model, "cuda", torch.bfloat16, image, is_align_face=True)
-
-video = pipe(image=image, prompt=prompt, num_inference_steps=50, guidance_scale=6.0, use_dynamic_cfg=False, id_vit_hidden=id_vit_hidden, id_cond=id_cond, kps_cond=face_kps, generator=torch.Generator("cuda").manual_seed(42))
-export_to_video(video.frames[0], "output.mp4", fps=8)
-```
-<table>
-  <tr>
-    <th style="text-align: center;">Face Image</th>
-    <th style="text-align: center;">Video</th>
-    <th style="text-align: center;">Description</th>
-  </tr>
-  <tr>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_image_0.png?download=true" style="height: auto; width: 600px;"></td>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_output_0.gif?download=true" style="height: auto; width: 2000px;"></td>
-    <td>The video, in a beautifully crafted animated style, features a confident woman riding a horse through a lush forest clearing. Her expression is focused yet serene as she adjusts her wide-brimmed hat with a practiced hand. She wears a flowy bohemian dress, which moves gracefully with the rhythm of the horse, the fabric flowing fluidly in the animated motion. The dappled sunlight filters through the trees, casting soft, painterly patterns on the forest floor. Her posture is poised, showing both control and elegance as she guides the horse with ease. The animation's gentle, fluid style adds a dreamlike quality to the scene, with the woman’s calm demeanor and the peaceful surroundings evoking a sense of freedom and harmony.</td>
-  </tr>
-  <tr>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_image_1.png?download=true" style="height: auto; width: 600px;"></td>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_output_1.gif?download=true" style="height: auto; width: 2000px;"></td>
-    <td>The video, in a captivating animated style, shows a woman standing in the center of a snowy forest, her eyes narrowed in concentration as she extends her hand forward. She is dressed in a deep blue cloak, her breath visible in the cold air, which is rendered with soft, ethereal strokes. A faint smile plays on her lips as she summons a wisp of ice magic, watching with focus as the surrounding trees and ground begin to shimmer and freeze, covered in delicate ice crystals. The animation’s fluid motion brings the magic to life, with the frost spreading outward in intricate, sparkling patterns. The environment is painted with soft, watercolor-like hues, enhancing the magical, dreamlike atmosphere. The overall mood is serene yet powerful, with the quiet winter air amplifying the delicate beauty of the frozen scene.</td>
-  </tr>
-  <tr>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_image_2.png?download=true" style="height: auto; width: 600px;"></td>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_output_2.gif?download=true" style="height: auto; width: 2000px;"></td>
-    <td>The animation features a whimsical portrait of a balloon seller standing in a gentle breeze, captured with soft, hazy brushstrokes that evoke the feel of a serene spring day. His face is framed by a gentle smile, his eyes squinting slightly against the sun, while a few wisps of hair flutter in the wind. He is dressed in a light, pastel-colored shirt, and the balloons around him sway with the wind, adding a sense of playfulness to the scene. The background blurs softly, with hints of a vibrant market or park, enhancing the light-hearted, yet tender mood of the moment.</td>
-  </tr>
-  <tr>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_image_3.png?download=true" style="height: auto; width: 600px;"></td>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_output_3.gif?download=true" style="height: auto; width: 2000px;"></td>
-    <td>The video captures a boy walking along a city street, filmed in black and white on a classic 35mm camera. His expression is thoughtful, his brow slightly furrowed as if he's lost in contemplation. The film grain adds a textured, timeless quality to the image, evoking a sense of nostalgia. Around him, the cityscape is filled with vintage buildings, cobblestone sidewalks, and softly blurred figures passing by, their outlines faint and indistinct. Streetlights cast a gentle glow, while shadows play across the boy's path, adding depth to the scene. The lighting highlights the boy's subtle smile, hinting at a fleeting moment of curiosity. The overall cinematic atmosphere, complete with classic film still aesthetics and dramatic contrasts, gives the scene an evocative and introspective feel.</td>
-  </tr>
-  <tr>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_image_4.png?download=true" style="height: auto; width: 600px;"></td>
-    <td><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/consisid/consisid_output_4.gif?download=true" style="height: auto; width: 2000px;"></td>
-    <td>The video features a baby wearing a bright superhero cape, standing confidently with arms raised in a powerful pose. The baby has a determined look on their face, with eyes wide and lips pursed in concentration, as if ready to take on a challenge. The setting appears playful, with colorful toys scattered around and a soft rug underfoot, while sunlight streams through a nearby window, highlighting the fluttering cape and adding to the impression of heroism. The overall atmosphere is lighthearted and fun, with the baby's expressions capturing a mix of innocence and an adorable attempt at bravery, as if truly ready to save the day.</td>
-  </tr>
-</table>
-
-## Resources
-
-通过以下资源了解有关 ConsisID 的更多信息：
-
- 一段 [视频](https://www.youtube.com/watch?v=PhlgC-bI5SQ) 演示了 ConsisID 的主要功能；
- 有关更多详细信息，请参阅研究论文 [Identity-Preserving Text-to-Video Generation by Frequency Decomposition](https://hf.co/papers/2411.17440)。
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team.
+# Copyright 2024 The HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -77,7 +77,6 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
 PIXART-α Controlnet pipeline | Implementation of the controlnet model for pixart alpha and its diffusers pipeline | [PIXART-α Controlnet pipeline](#pixart-α-controlnet-pipeline) | - | [Raul Ciotescu](https://github.com/raulc0399/) |
 | HunyuanDiT Differential Diffusion Pipeline | Applies [Differential Diffusion](https://github.com/exx8/differential-diffusion) to [HunyuanDiT](https://github.com/huggingface/diffusers/pull/8240). | [HunyuanDiT with Differential Diffusion](#hunyuandit-with-differential-diffusion) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1v44a5fpzyr4Ffr4v2XBQ7BajzG874N4P?usp=sharing) | [Monjoy Choudhury](https://github.com/MnCSSJ4x) |
 | [🪆Matryoshka Diffusion Models](https://huggingface.co/papers/2310.15111) | A diffusion process that denoises inputs at multiple resolutions jointly and uses a NestedUNet architecture where features and parameters for small scale inputs are nested within those of the large scales. See [original codebase](https://github.com/apple/ml-mdm). | [🪆Matryoshka Diffusion Models](#matryoshka-diffusion-models) | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/pcuenq/mdm) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/tolgacangoz/1f54875fc7aeaabcf284ebde64820966/matryoshka_hf.ipynb) | [M. Tolga Cangöz](https://github.com/tolgacangoz) |
-| Stable Diffusion XL Attentive Eraser Pipeline |[[AAAI2025 Oral] Attentive Eraser](https://github.com/Anonym0u3/AttentiveEraser) is a novel tuning-free method that enhances object removal capabilities in pre-trained diffusion models.|[Stable Diffusion XL Attentive Eraser Pipeline](#stable-diffusion-xl-attentive-eraser-pipeline)|-|[Wenhao Sun](https://github.com/Anonym0u3) and [Benlei Cui](https://github.com/Benny079)|

 To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.

@@ -4586,8 +4585,8 @@ image = pipe(
 ```

 | ![Gradient](https://github.com/user-attachments/assets/e38ce4d5-1ae6-4df0-ab43-adc1b45716b5) | ![Input](https://github.com/user-attachments/assets/9c95679c-e9d7-4f5a-90d6-560203acd6b3) | ![Output](https://github.com/user-attachments/assets/5313ff64-a0c4-418b-8b55-a38f1a5e7532) |
-| -------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------ |
-| Gradient                                                                                     | Input                                                                                     | Output                                                                                     |
+| ------------------------------------------------------------------------------------------ | --------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------- |
+| Gradient                                                                                   | Input                                                                                   | Output                                                                                   |

 A colab notebook demonstrating all results can be found [here](https://colab.research.google.com/drive/1v44a5fpzyr4Ffr4v2XBQ7BajzG874N4P?usp=sharing). Depth Maps have also been added in the same colab.

@@ -4635,93 +4634,6 @@ make_image_grid(image, rows=1, cols=len(image))
 # 50+, 100+, and 250+ num_inference_steps are recommended for nesting levels 0, 1, and 2 respectively.
 ```

-### Stable Diffusion XL Attentive Eraser Pipeline
-<img src="https://raw.githubusercontent.com/Anonym0u3/Images/refs/heads/main/fenmian.png"  width="600" />
-
-**Stable Diffusion XL Attentive Eraser Pipeline** is an advanced object removal pipeline that leverages SDXL for precise content suppression and seamless region completion. This pipeline uses **self-attention redirection guidance** to modify the model’s self-attention mechanism, allowing for effective removal and inpainting across various levels of mask precision, including semantic segmentation masks, bounding boxes, and hand-drawn masks. If you are interested in more detailed information and have any questions, please refer to the [paper](https://arxiv.org/abs/2412.12974) and [official implementation](https://github.com/Anonym0u3/AttentiveEraser).
-
-#### Key features
-
- **Tuning-Free**: No additional training is required, making it easy to integrate and use.
- **Flexible Mask Support**: Works with different types of masks for targeted object removal.
- **High-Quality Results**: Utilizes the inherent generative power of diffusion models for realistic content completion.
-
-#### Usage example
-To use the Stable Diffusion XL Attentive Eraser Pipeline, you can initialize it as follows:
-```py
-import torch
-from diffusers import DDIMScheduler, DiffusionPipeline
-from diffusers.utils import load_image
-import torch.nn.functional as F
-from torchvision.transforms.functional import to_tensor, gaussian_blur
-
-dtype = torch.float16
-device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") 
-
-scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
-pipeline = DiffusionPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    custom_pipeline="pipeline_stable_diffusion_xl_attentive_eraser",
-    scheduler=scheduler,
-    variant="fp16",
-    use_safetensors=True,
-    torch_dtype=dtype,
-).to(device)
-
-
-def preprocess_image(image_path, device):
-    image = to_tensor((load_image(image_path)))
-    image = image.unsqueeze_(0).float() * 2 - 1 # [0,1] --> [-1,1]
-    if image.shape[1] != 3:
-        image = image.expand(-1, 3, -1, -1)
-        image = F.interpolate(image, (1024, 1024))
-        image = image.to(dtype).to(device)
-        return image
-
-def preprocess_mask(mask_path, device):
-    # Load the mask converted to PIL mode 'L' and add a batch dimension.
-    mask = to_tensor((load_image(mask_path, convert_method=lambda img: img.convert('L'))))
-    mask = mask.unsqueeze_(0).float()  # 0 or 1
-    mask = F.interpolate(mask, (1024, 1024))
-    # Blur with a 77x77 Gaussian kernel, then re-binarize at a 0.1 threshold.
-    mask = gaussian_blur(mask, kernel_size=(77, 77))
-    mask[mask < 0.1] = 0
-    mask[mask >= 0.1] = 1
-    # `dtype` is the variable assigned near the top of this example snippet.
-    mask = mask.to(dtype).to(device)
-    return mask
-
-prompt = "" # Set prompt to null
-seed=123 
-generator = torch.Generator(device=device).manual_seed(seed)
-source_image_path = "https://raw.githubusercontent.com/Anonym0u3/Images/refs/heads/main/an1024.png"
-mask_path = "https://raw.githubusercontent.com/Anonym0u3/Images/refs/heads/main/an1024_mask.png"
-source_image = preprocess_image(source_image_path, device)
-mask = preprocess_mask(mask_path, device)
-
-image = pipeline(
-    prompt=prompt, 
-    image=source_image,
-    mask_image=mask,
-    height=1024,
-    width=1024,
-    AAS=True, # enable AAS
-    strength=0.8, # inpainting strength
-    rm_guidance_scale=9, # removal guidance scale
-    ss_steps = 9, # similarity suppression steps
-    ss_scale = 0.3, # similarity suppression scale
-    AAS_start_step=0, # AAS start step
-    AAS_start_layer=34, # AAS start layer
-    AAS_end_layer=70, # AAS end layer
-    num_inference_steps=50, # number of inference steps # AAS_end_step = int(strength*num_inference_steps)
-    generator=generator,
-    guidance_scale=1,
-).images[0]
-image.save('./removed_img.png')
-print("Object removal completed")
-```
-
-| Source Image                                                                                   | Mask                                                                                        | Output                                                                                              |
-| ---------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------- | --------------------------------------------------------------------------------------------------- |
-| ![Source Image](https://raw.githubusercontent.com/Anonym0u3/Images/refs/heads/main/an1024.png) | ![Mask](https://raw.githubusercontent.com/Anonym0u3/Images/refs/heads/main/an1024_mask.png) | ![Output](https://raw.githubusercontent.com/Anonym0u3/Images/refs/heads/main/AE_step40_layer34.png) |
-
 # Perturbed-Attention Guidance

 [Project](https://ku-cvlab.github.io/Perturbed-Attention-Guidance/) / [arXiv](https://arxiv.org/abs/2403.17377) / [GitHub](https://github.com/KU-CVLAB/Perturbed-Attention-Guidance)
@@ -404,11 +404,10 @@ def my_forward(
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
-            is_npu = sample.device.type == "npu"
            if isinstance(timestep, float):
-                dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+                dtype = torch.float32 if is_mps else torch.float64
            else:
-                dtype = torch.int32 if (is_mps or is_npu) else torch.int64
+                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)
@@ -80,6 +80,7 @@ from diffusers.utils import (
    USE_PEFT_BACKEND,
    BaseOutput,
    deprecate,
+    is_torch_version,
    is_torch_xla_available,
    logging,
    replace_example_docstring,
@@ -868,7 +869,23 @@ class CrossAttnDownBlock2D(nn.Module):

        for i, (resnet, attn) in enumerate(blocks):
            if torch.is_grad_enabled() and self.gradient_checkpointing:
-                hidden_states = self._gradient_checkpointing_func(resnet, hidden_states, temb)
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
                hidden_states = attn(
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
@@ -1013,6 +1030,17 @@ class UNetMidBlock2DCrossAttn(nn.Module):
        hidden_states = self.resnets[0](hidden_states, temb)
        for attn, resnet in zip(self.attentions, self.resnets[1:]):
            if torch.is_grad_enabled() and self.gradient_checkpointing:
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                hidden_states = attn(
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
@@ -1021,7 +1049,12 @@ class UNetMidBlock2DCrossAttn(nn.Module):
                    encoder_attention_mask=encoder_attention_mask,
                    return_dict=False,
                )[0]
-                hidden_states = self._gradient_checkpointing_func(resnet, hidden_states, temb)
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
            else:
                hidden_states = attn(
                    hidden_states,
@@ -1159,7 +1192,23 @@ class CrossAttnUpBlock2D(nn.Module):
            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

            if torch.is_grad_enabled() and self.gradient_checkpointing:
-                hidden_states = self._gradient_checkpointing_func(resnet, hidden_states, temb)
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(resnet),
+                    hidden_states,
+                    temb,
+                    **ckpt_kwargs,
+                )
                hidden_states = attn(
                    hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
@@ -1233,6 +1282,10 @@ class MatryoshkaTransformer2DModel(LegacyModelMixin, LegacyConfigMixin):
            ]
        )

+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
    def forward(
        self,
        hidden_states: torch.Tensor,
@@ -1312,8 +1365,19 @@ class MatryoshkaTransformer2DModel(LegacyModelMixin, LegacyConfigMixin):
        # Blocks
        for block in self.transformer_blocks:
            if torch.is_grad_enabled() and self.gradient_checkpointing:
-                hidden_states = self._gradient_checkpointing_func(
-                    block,
+
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
@@ -1321,6 +1385,7 @@ class MatryoshkaTransformer2DModel(LegacyModelMixin, LegacyConfigMixin):
                    timestep,
                    cross_attention_kwargs,
                    class_labels,
+                    **ckpt_kwargs,
                )
            else:
                hidden_states = block(
@@ -2659,6 +2724,10 @@ class MatryoshkaUNet2DConditionModel(
        for module in self.children():
            fn_recursive_set_attention_slice(module, reversed_slice_size)

+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
    def enable_freeu(self, s1: float, s2: float, b1: float, b2: float):
        r"""Enables the FreeU mechanism from https://arxiv.org/abs/2309.11497.

@@ -2737,11 +2806,10 @@ class MatryoshkaUNet2DConditionModel(
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
-            is_npu = sample.device.type == "npu"
            if isinstance(timestep, float):
-                dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+                dtype = torch.float32 if is_mps else torch.float64
            else:
-                dtype = torch.int32 if (is_mps or is_npu) else torch.int64
+                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)
@@ -1,5 +1,5 @@
 #
-# Copyright 2025 The HuggingFace Inc. team.
+# Copyright 2024 The HuggingFace Inc. team.
 # SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -1,5 +1,5 @@
 #
-# Copyright 2025 The HuggingFace Inc. team.
+# Copyright 2024 The HuggingFace Inc. team.
 # SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -1,5 +1,5 @@
 #
-# Copyright 2025 The HuggingFace Inc. team.
+# Copyright 2024 The HuggingFace Inc. team.
 # SPDX-FileCopyrightText: Copyright (c) 1993-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -193,8 +193,7 @@ class StableDiffusionXLControlNetReferencePipeline(StableDiffusionXLControlNetPi

    def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do_classifier_free_guidance):
        refimage = refimage.to(device=device)
-        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
-        if needs_upcasting:
+        if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
            self.upcast_vae()
            refimage = refimage.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
        if refimage.dtype != self.vae.dtype:
@@ -224,11 +223,6 @@ class StableDiffusionXLControlNetReferencePipeline(StableDiffusionXLControlNetPi

        # aligning device to prevent device errors when concating it with the latent model input
        ref_image_latents = ref_image_latents.to(device=device, dtype=dtype)
-
-        # cast back to fp16 if needed
-        if needs_upcasting:
-            self.vae.to(dtype=torch.float16)
-
        return ref_image_latents

    def prepare_ref_image(
@@ -139,8 +139,7 @@ def retrieve_timesteps(
 class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
    def prepare_ref_latents(self, refimage, batch_size, dtype, device, generator, do_classifier_free_guidance):
        refimage = refimage.to(device=device)
-        needs_upcasting = self.vae.dtype == torch.float16 and self.vae.config.force_upcast
-        if needs_upcasting:
+        if self.vae.dtype == torch.float16 and self.vae.config.force_upcast:
            self.upcast_vae()
            refimage = refimage.to(next(iter(self.vae.post_quant_conv.parameters())).dtype)
        if refimage.dtype != self.vae.dtype:
@@ -170,11 +169,6 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):

        # aligning device to prevent device errors when concating it with the latent model input
        ref_image_latents = ref_image_latents.to(device=device, dtype=dtype)
-
-        # cast back to fp16 if needed
-        if needs_upcasting:
-            self.vae.to(dtype=torch.float16)
-
        return ref_image_latents

    def prepare_ref_image(
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -742,29 +742,3 @@ accelerate launch train_dreambooth.py \
 ## Stable Diffusion XL

 We support fine-tuning of the UNet shipped in [Stable Diffusion XL](https://huggingface.co/papers/2307.01952) with DreamBooth and LoRA via the `train_dreambooth_lora_sdxl.py` script. Please refer to the docs [here](./README_sdxl.md).
-
-## Dataset
-
-We support 🤗 [Datasets](https://huggingface.co/docs/datasets/index), you can find a dataset on the [Hugging Face Hub](https://huggingface.co/datasets) or use your own.
-
-The quickest way to get started with your custom dataset is 🤗 Datasets' [`ImageFolder`](https://huggingface.co/docs/datasets/image_dataset#imagefolder).
-
-We need to create a file `metadata.jsonl` in the directory with our images:
-
-```
-{"file_name": "01.jpg", "prompt": "prompt 01"}
-{"file_name": "02.jpg", "prompt": "prompt 02"}
-```
-
-If we have a directory with image-text pairs e.g. `01.jpg` and `01.txt` then `convert_to_imagefolder.py` can create `metadata.jsonl`.
-
-```sh
-python convert_to_imagefolder.py --path my_dataset/
-```
-
-We use `--dataset_name` and `--caption_column` with training scripts.
-
-```
--dataset_name=my_dataset/
--caption_column=prompt
-```
@@ -1,32 +0,0 @@
-import argparse
-import json
-import pathlib
-
-
-parser = argparse.ArgumentParser()
-parser.add_argument(
-    "--path",
-    type=str,
-    required=True,
-    help="Path to folder with image-text pairs.",
-)
-parser.add_argument("--caption_column", type=str, default="prompt", help="Name of caption column.")
-args = parser.parse_args()
-
-path = pathlib.Path(args.path)
-if not path.exists():
-    raise RuntimeError(f"`--path` '{args.path}' does not exist.")
-
-all_files = list(path.glob("*"))
-captions = list(path.glob("*.txt"))
-images = set(all_files) - set(captions)
-images = {image.stem: image for image in images}
-caption_image = {caption: images.get(caption.stem) for caption in captions if images.get(caption.stem)}
-
-metadata = path.joinpath("metadata.jsonl")
-
-with metadata.open("w", encoding="utf-8") as f:
-    for caption, image in caption_image.items():
-        caption_text = caption.read_text(encoding="utf-8")
-        json.dump({"file_name": image.name, args.caption_column: caption_text}, f)
-        f.write("\n")
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1716,9 +1716,9 @@ def main(args):
                pipeline = FluxPipeline.from_pretrained(
                    args.pretrained_model_name_or_path,
                    vae=vae,
-                    text_encoder=accelerator.unwrap_model(text_encoder_one, keep_fp32_wrapper=False),
-                    text_encoder_2=accelerator.unwrap_model(text_encoder_two, keep_fp32_wrapper=False),
-                    transformer=accelerator.unwrap_model(transformer, keep_fp32_wrapper=False),
+                    text_encoder=accelerator.unwrap_model(text_encoder_one),
+                    text_encoder_2=accelerator.unwrap_model(text_encoder_two),
+                    transformer=accelerator.unwrap_model(transformer),
                    revision=args.revision,
                    variant=args.variant,
                    torch_dtype=weight_dtype,
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -54,11 +54,7 @@ from diffusers import (
 )
 from diffusers.loaders import StableDiffusionLoraLoaderMixin
 from diffusers.optimization import get_scheduler
-from diffusers.training_utils import (
-    _set_state_dict_into_text_encoder,
-    cast_training_params,
-    free_memory,
-)
+from diffusers.training_utils import _set_state_dict_into_text_encoder, cast_training_params
 from diffusers.utils import (
    check_min_version,
    convert_state_dict_to_diffusers,
@@ -155,14 +151,14 @@ def log_validation(
    if args.validation_images is None:
        images = []
        for _ in range(args.num_validation_images):
-            with torch.amp.autocast(accelerator.device.type):
+            with torch.cuda.amp.autocast():
                image = pipeline(**pipeline_args, generator=generator).images[0]
                images.append(image)
    else:
        images = []
        for image in args.validation_images:
            image = Image.open(image)
-            with torch.amp.autocast(accelerator.device.type):
+            with torch.cuda.amp.autocast():
                image = pipeline(**pipeline_args, image=image, generator=generator).images[0]
            images.append(image)

@@ -181,7 +177,7 @@ def log_validation(
            )

    del pipeline
-    free_memory()
+    torch.cuda.empty_cache()

    return images

@@ -797,7 +793,7 @@ def main(args):
        cur_class_images = len(list(class_images_dir.iterdir()))

        if cur_class_images < args.num_class_images:
-            torch_dtype = torch.float16 if accelerator.device.type in ("cuda", "xpu") else torch.float32
+            torch_dtype = torch.float16 if accelerator.device.type == "cuda" else torch.float32
            if args.prior_generation_precision == "fp32":
                torch_dtype = torch.float32
            elif args.prior_generation_precision == "fp16":
@@ -833,7 +829,8 @@ def main(args):
                    image.save(image_filename)

            del pipeline
-            free_memory()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()

    # Handle the repository creation
    if accelerator.is_main_process:
@@ -1088,7 +1085,7 @@ def main(args):
        tokenizer = None

        gc.collect()
-        free_memory()
+        torch.cuda.empty_cache()
    else:
        pre_computed_encoder_hidden_states = None
        validation_prompt_encoder_hidden_states = None
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -63,7 +63,6 @@ from diffusers.utils import (
    is_wandb_available,
 )
 from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
-from diffusers.utils.import_utils import is_torch_npu_available
 from diffusers.utils.torch_utils import is_compiled_module


@@ -75,9 +74,6 @@ check_min_version("0.33.0.dev0")

 logger = get_logger(__name__)

-if is_torch_npu_available():
-    torch.npu.config.allow_internal_format = False
-

 def save_model_card(
    repo_id: str,
@@ -162,9 +158,6 @@ def log_validation(
        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
        f" {args.validation_prompt}."
    )
-    if args.enable_vae_tiling:
-        pipeline.vae.enable_tiling(tile_sample_min_height=1024, tile_sample_stride_width=1024)
-
    pipeline.text_encoder = pipeline.text_encoder.to(torch.bfloat16)
    pipeline = pipeline.to(accelerator.device)
    pipeline.set_progress_bar_config(disable=True)
@@ -604,8 +597,6 @@ def parse_args(input_args=None):
        help="Whether to offload the VAE and the text encoder to CPU when they are not used.",
    )
    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
-    parser.add_argument("--enable_vae_tiling", action="store_true", help="Enabla vae tiling in log validation")
-    parser.add_argument("--enable_npu_flash_attention", action="store_true", help="Enabla Flash Attention for NPU")

    if input_args is not None:
        args = parser.parse_args(input_args)
@@ -929,7 +920,8 @@ def main(args):
                    image.save(image_filename)

            del pipeline
-            free_memory()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()

    # Handle the repository creation
    if accelerator.is_main_process:
@@ -992,13 +984,6 @@ def main(args):
    # because Gemma2 is particularly suited for bfloat16.
    text_encoder.to(dtype=torch.bfloat16)

-    if args.enable_npu_flash_attention:
-        if is_torch_npu_available():
-            logger.info("npu flash attention enabled.")
-            transformer.enable_npu_flash_attention()
-        else:
-            raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu device ")
-
    # Initialize a text encoding pipeline and keep it to CPU for now.
    text_encoding_pipeline = SanaPipeline.from_pretrained(
        args.pretrained_model_name_or_path,
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -812,8 +812,6 @@ def main(args):
        for name, module in flux_transformer.named_modules():
            if "transformer_blocks" in name:
                module.requires_grad_(True)
-            else:
-                module.requirs_grad_(False)

    def unwrap_model(model):
        model = accelerator.unwrap_model(model)
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,59 +0,0 @@
-# AutoencoderKL training example
-
-## Installing the dependencies
-
-Before running the scripts, make sure to install the library's training dependencies:
-
-**Important**
-
-To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
-```bash
-git clone https://github.com/huggingface/diffusers
-cd diffusers
-pip install .
-```
-
-Then cd in the example folder  and run
-```bash
-pip install -r requirements.txt
-```
-
-
-And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
-
-```bash
-accelerate config
-```
-
-## Training on CIFAR10
-
-Please replace the validation image with your own image.
-
-```bash
-accelerate launch train_autoencoderkl.py \
-    --pretrained_model_name_or_path stabilityai/sd-vae-ft-mse \
-    --dataset_name=cifar10 \
-    --image_column=img \
-    --validation_image images/bird.jpg images/car.jpg images/dog.jpg images/frog.jpg \
-    --num_train_epochs 100 \
-    --gradient_accumulation_steps 2 \
-    --learning_rate 4.5e-6 \
-    --lr_scheduler cosine \
-    --report_to wandb \
-```
-
-## Training on ImageNet
-
-```bash
-accelerate launch train_autoencoderkl.py \
-    --pretrained_model_name_or_path stabilityai/sd-vae-ft-mse \
-    --num_train_epochs 100 \
-    --gradient_accumulation_steps 2 \
-    --learning_rate 4.5e-6 \
-    --lr_scheduler cosine \
-    --report_to wandb \
-    --mixed_precision bf16 \
-    --train_data_dir /path/to/ImageNet/train \
-    --validation_image ./image.png \
-    --decoder_only
-```
@@ -1,15 +0,0 @@
-accelerate>=0.16.0
-bitsandbytes
-datasets
-huggingface_hub
-lpips
-numpy
-packaging
-Pillow
-taming_transformers
-torch
-torchvision
-tqdm
-transformers
-wandb
-xformers
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -8,6 +8,7 @@ from diffusers.models import PixArtTransformer2DModel
 from diffusers.models.attention import BasicTransformerBlock
 from diffusers.models.modeling_outputs import Transformer2DModelOutput
 from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils.torch_utils import is_torch_version


 class PixArtControlNetAdapterBlock(nn.Module):
@@ -150,6 +151,10 @@ class PixArtControlNetTransformerModel(ModelMixin, ConfigMixin):
        self.transformer = transformer
        self.controlnet = controlnet

+    def _set_gradient_checkpointing(self, module, value=False):
+        if hasattr(module, "gradient_checkpointing"):
+            module.gradient_checkpointing = value
+
    def forward(
        self,
        hidden_states: torch.Tensor,
@@ -215,8 +220,18 @@ class PixArtControlNetTransformerModel(ModelMixin, ConfigMixin):
                print("Gradient checkpointing is not supported for the controlnet transformer model, yet.")
                exit(1)

-                hidden_states = self._gradient_checkpointing_func(
-                    block,
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+
+                    return custom_forward
+
+                ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(block),
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
@@ -224,6 +239,7 @@ class PixArtControlNetTransformerModel(ModelMixin, ConfigMixin):
                    timestep,
                    cross_attention_kwargs,
                    None,
+                    **ckpt_kwargs,
                )
            else:
                # the control nets are only used for the blocks 1 to self.blocks_num
@@ -1031,11 +1031,10 @@ class PixArtAlphaControlnetPipeline(DiffusionPipeline):
                    # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
                    # This would be a good case for the `match` statement (Python 3.10+)
                    is_mps = latent_model_input.device.type == "mps"
-                    is_npu = latent_model_input.device.type == "npu"
                    if isinstance(current_timestep, float):
-                        dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+                        dtype = torch.float32 if is_mps else torch.float64
                    else:
-                        dtype = torch.int32 if (is_mps or is_npu) else torch.int64
+                        dtype = torch.int32 if is_mps else torch.int64
                    current_timestep = torch.tensor([current_timestep], dtype=dtype, device=latent_model_input.device)
                elif len(current_timestep.shape) == 0:
                    current_timestep = current_timestep[None].to(latent_model_input.device)
@@ -258,11 +258,10 @@ class PromptDiffusionControlNetModel(ControlNetModel):
            # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can
            # This would be a good case for the `match` statement (Python 3.10+)
            is_mps = sample.device.type == "mps"
-            is_npu = sample.device.type == "npu"
            if isinstance(timestep, float):
-                dtype = torch.float32 if (is_mps or is_npu) else torch.float64
+                dtype = torch.float32 if is_mps else torch.float64
            else:
-                dtype = torch.int32 if (is_mps or is_npu) else torch.int64
+                dtype = torch.int32 if is_mps else torch.int64
            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
        elif len(timesteps.shape) == 0:
            timesteps = timesteps[None].to(sample.device)
@@ -1,100 +0,0 @@
-# Generating images using Flux and PyTorch/XLA
-
-The `flux_inference` script shows how to do image generation using Flux on TPU devices using PyTorch/XLA. It uses the pallas kernel for flash attention for faster generation.
-
-It has been tested on [Trillium](https://cloud.google.com/blog/products/compute/introducing-trillium-6th-gen-tpus) TPU versions. No other TPU types have been tested.
-
-## Create TPU
-
-To create a TPU on Google Cloud, follow [this guide](https://cloud.google.com/tpu/docs/v6e)
-
-## Setup TPU environment
-
-SSH into the VM and install Pytorch, Pytorch/XLA
-
-```bash
-pip install torch~=2.5.0 torch_xla[tpu]~=2.5.0 -f https://storage.googleapis.com/libtpu-releases/index.html -f https://storage.googleapis.com/libtpu-wheels/index.html
-pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
-```
-
-Verify that PyTorch and PyTorch/XLA were installed correctly:
-
-```bash
-python3 -c "import torch; import torch_xla;"
-```
-
-Install dependencies
-
-```bash
-pip install transformers accelerate sentencepiece structlog
-pushd ../../..
-pip install .
-popd
-```
-
-## Run the inference job
-
-### Authenticate
-
-Run the following command to authenticate your token in order to download Flux weights.
-
-```bash
-huggingface-cli login
-```
-
-Then run:
-
-```bash
-python flux_inference.py
-```
-
-The script loads the text encoders onto the CPU and the Flux transformer and VAE models onto the TPU. The first time the script runs, the compilation time is longer, while the cache stores the compiled programs. On subsequent runs, compilation is much faster and the subsequent passes being the fastest. 
-
-On a Trillium v6e-4, you should expect ~9 sec / 4 images or 2.25 sec / image (as devices run generation in parallel):
-
-```bash
-WARNING:root:libtpu.so and TPU device found. Setting PJRT_DEVICE=TPU.
-Loading checkpoint shards: 100%|███████████████████████████████| 2/2 [00:00<00:00,  7.01it/s]
-Loading pipeline components...:  40%|██████████▍               | 2/5 [00:00<00:00,  3.78it/s]You set `add_prefix_space`. The tokenizer needs to be converted from the slow tokenizers
-Loading pipeline components...: 100%|██████████████████████████| 5/5 [00:00<00:00,  6.72it/s]
-2025-01-10 00:51:25 [info     ] loading flux from black-forest-labs/FLUX.1-dev
-2025-01-10 00:51:25 [info     ] loading flux from black-forest-labs/FLUX.1-dev
-2025-01-10 00:51:26 [info     ] loading flux from black-forest-labs/FLUX.1-dev
-2025-01-10 00:51:26 [info     ] loading flux from black-forest-labs/FLUX.1-dev
-Loading pipeline components...: 100%|██████████████████████████| 3/3 [00:00<00:00,  4.29it/s]
-Loading pipeline components...: 100%|██████████████████████████| 3/3 [00:00<00:00,  3.26it/s]
-Loading pipeline components...: 100%|██████████████████████████| 3/3 [00:00<00:00,  3.27it/s]
-Loading pipeline components...: 100%|██████████████████████████| 3/3 [00:00<00:00,  3.25it/s]
-2025-01-10 00:51:34 [info     ] starting compilation run...   
-2025-01-10 00:51:35 [info     ] starting compilation run...   
-2025-01-10 00:51:37 [info     ] starting compilation run...   
-2025-01-10 00:51:37 [info     ] starting compilation run...   
-2025-01-10 00:52:52 [info     ] compilation took 78.5155531649998 sec.
-2025-01-10 00:52:53 [info     ] starting inference run...     
-2025-01-10 00:52:57 [info     ] compilation took 79.52986721400157 sec.
-2025-01-10 00:52:57 [info     ] compilation took 81.91776501700042 sec.
-2025-01-10 00:52:57 [info     ] compilation took 80.24951512600092 sec.
-2025-01-10 00:52:57 [info     ] starting inference run...     
-2025-01-10 00:52:57 [info     ] starting inference run...     
-2025-01-10 00:52:58 [info     ] starting inference run...     
-2025-01-10 00:53:22 [info     ] inference time: 25.112665320000815
-2025-01-10 00:53:30 [info     ] inference time: 7.7019307739992655
-2025-01-10 00:53:38 [info     ] inference time: 7.693858365000779
-2025-01-10 00:53:46 [info     ] inference time: 7.690621814001133
-2025-01-10 00:53:53 [info     ] inference time: 7.679490454000188
-2025-01-10 00:54:01 [info     ] inference time: 7.68949568500102
-2025-01-10 00:54:09 [info     ] inference time: 7.686633744000574
-2025-01-10 00:54:16 [info     ] inference time: 7.696786873999372
-2025-01-10 00:54:24 [info     ] inference time: 7.691988694999964
-2025-01-10 00:54:32 [info     ] inference time: 7.700649563999832
-2025-01-10 00:54:39 [info     ] inference time: 7.684993574001055
-2025-01-10 00:54:47 [info     ] inference time: 7.68343457499941
-2025-01-10 00:54:55 [info     ] inference time: 7.667921153999487
-2025-01-10 00:55:02 [info     ] inference time: 7.683585194001353
-2025-01-10 00:55:06 [info     ] avg. inference over 15 iterations took 8.61202360273334 sec.
-2025-01-10 00:55:07 [info     ] avg. inference over 15 iterations took 8.952725123600006 sec.
-2025-01-10 00:55:10 [info     ] inference time: 7.673799695001435
-2025-01-10 00:55:10 [info     ] avg. inference over 15 iterations took 8.849190365400379 sec.
-2025-01-10 00:55:10 [info     ] saved metric information as /tmp/metrics_report.txt
-2025-01-10 00:55:12 [info     ] avg. inference over 15 iterations took 8.940161458400205 sec.
-```
@@ -1,120 +0,0 @@
-from argparse import ArgumentParser
-from pathlib import Path
-from time import perf_counter
-
-import structlog
-import torch
-import torch_xla.core.xla_model as xm
-import torch_xla.debug.metrics as met
-import torch_xla.debug.profiler as xp
-import torch_xla.distributed.xla_multiprocessing as xmp
-import torch_xla.runtime as xr
-
-from diffusers import FluxPipeline
-
-
-logger = structlog.get_logger()
-metrics_filepath = "/tmp/metrics_report.txt"
-
-
-def _main(index, args, text_pipe, ckpt_id):
-    cache_path = Path("/tmp/data/compiler_cache_tRiLlium_eXp")
-    cache_path.mkdir(parents=True, exist_ok=True)
-    xr.initialize_cache(str(cache_path), readonly=False)
-
-    profile_path = Path("/tmp/data/profiler_out_tRiLlium_eXp")
-    profile_path.mkdir(parents=True, exist_ok=True)
-    profiler_port = 9012
-    profile_duration = args.profile_duration
-    if args.profile:
-        logger.info(f"starting profiler on port {profiler_port}")
-        _ = xp.start_server(profiler_port)
-    device0 = xm.xla_device()
-
-    logger.info(f"loading flux from {ckpt_id}")
-    flux_pipe = FluxPipeline.from_pretrained(
-        ckpt_id, text_encoder=None, tokenizer=None, text_encoder_2=None, tokenizer_2=None, torch_dtype=torch.bfloat16
-    ).to(device0)
-    flux_pipe.transformer.enable_xla_flash_attention(partition_spec=("data", None, None, None), is_flux=True)
-
-    prompt = "photograph of an electronics chip in the shape of a race car with trillium written on its side"
-    width = args.width
-    height = args.height
-    guidance = args.guidance
-    n_steps = 4 if args.schnell else 28
-
-    logger.info("starting compilation run...")
-    ts = perf_counter()
-    with torch.no_grad():
-        prompt_embeds, pooled_prompt_embeds, text_ids = text_pipe.encode_prompt(
-            prompt=prompt, prompt_2=None, max_sequence_length=512
-        )
-    prompt_embeds = prompt_embeds.to(device0)
-    pooled_prompt_embeds = pooled_prompt_embeds.to(device0)
-
-    image = flux_pipe(
-        prompt_embeds=prompt_embeds,
-        pooled_prompt_embeds=pooled_prompt_embeds,
-        num_inference_steps=28,
-        guidance_scale=guidance,
-        height=height,
-        width=width,
-    ).images[0]
-    logger.info(f"compilation took {perf_counter() - ts} sec.")
-    image.save("/tmp/compile_out.png")
-
-    base_seed = 4096 if args.seed is None else args.seed
-    seed_range = 1000
-    unique_seed = base_seed + index * seed_range
-    xm.set_rng_state(seed=unique_seed, device=device0)
-    times = []
-    logger.info("starting inference run...")
-    for _ in range(args.itters):
-        ts = perf_counter()
-        with torch.no_grad():
-            prompt_embeds, pooled_prompt_embeds, text_ids = text_pipe.encode_prompt(
-                prompt=prompt, prompt_2=None, max_sequence_length=512
-            )
-        prompt_embeds = prompt_embeds.to(device0)
-        pooled_prompt_embeds = pooled_prompt_embeds.to(device0)
-
-        if args.profile:
-            xp.trace_detached(f"localhost:{profiler_port}", str(profile_path), duration_ms=profile_duration)
-        image = flux_pipe(
-            prompt_embeds=prompt_embeds,
-            pooled_prompt_embeds=pooled_prompt_embeds,
-            num_inference_steps=n_steps,
-            guidance_scale=guidance,
-            height=height,
-            width=width,
-        ).images[0]
-        inference_time = perf_counter() - ts
-        if index == 0:
-            logger.info(f"inference time: {inference_time}")
-        times.append(inference_time)
-    logger.info(f"avg. inference over {args.itters} iterations took {sum(times)/len(times)} sec.")
-    image.save(f"/tmp/inference_out-{index}.png")
-    if index == 0:
-        metrics_report = met.metrics_report()
-        with open(metrics_filepath, "w+") as fout:
-            fout.write(metrics_report)
-        logger.info(f"saved metric information as {metrics_filepath}")
-
-
-if __name__ == "__main__":
-    parser = ArgumentParser()
-    parser.add_argument("--schnell", action="store_true", help="run flux schnell instead of dev")
-    parser.add_argument("--width", type=int, default=1024, help="width of the image to generate")
-    parser.add_argument("--height", type=int, default=1024, help="height of the image to generate")
-    parser.add_argument("--guidance", type=float, default=3.5, help="gauidance strentgh for dev")
-    parser.add_argument("--seed", type=int, default=None, help="seed for inference")
-    parser.add_argument("--profile", action="store_true", help="enable profiling")
-    parser.add_argument("--profile-duration", type=int, default=10000, help="duration for profiling in msec.")
-    parser.add_argument("--itters", type=int, default=15, help="tiems to run inference and get avg time in sec.")
-    args = parser.parse_args()
-    if args.schnell:
-        ckpt_id = "black-forest-labs/FLUX.1-schnell"
-    else:
-        ckpt_id = "black-forest-labs/FLUX.1-dev"
-    text_pipe = FluxPipeline.from_pretrained(ckpt_id, transformer=None, vae=None, torch_dtype=torch.bfloat16).to("cpu")
-    xmp.spawn(_main, args=(args, text_pipe, ckpt_id))
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,4 +1,4 @@
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,4 +1,4 @@
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -515,6 +515,10 @@ def main():
    elif accelerator.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

+    # Freeze the unet parameters before adding adapters
+    for param in unet.parameters():
+        param.requires_grad_(False)
+
    unet_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -1,6 +1,6 @@
 #!/usr/bin/env python
 # coding=utf-8
-# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
--- a/Show More
+++ b/Show More