remove print

first draft
Update src/diffusers/pipelines/pipeline_utils.py
2024-12-04 13:04:48 +00:00 · 2024-12-04 13:03:35 +00:00 · 2024-12-04 13:57:21 +01:00 · 2024-12-04 13:56:21 +01:00 · 2024-12-04 17:34:58 +05:30 · 2024-12-03 14:06:47 +00:00
279 changed files with 1384 additions and 42279 deletions
@@ -238,13 +238,12 @@ jobs:

  run_flax_tpu_tests:
    name: Nightly Flax TPU Tests
-    runs-on:
-      group: gcp-ct5lp-hightpu-8t
+    runs-on: docker-tpu
    if: github.event_name == 'schedule'

    container:
      image: diffusers/diffusers-flax-tpu
-      options: --shm-size "16gb" --ipc host --privileged ${{ vars.V5_LITEPOD_8_ENV}} -v /mnt/hf_cache:/mnt/hf_cache
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/ --privileged
    defaults:
      run:
        shell: bash
@@ -357,10 +356,6 @@ jobs:
        config:
          - backend: "bitsandbytes"
            test_location: "bnb"
-          - backend: "gguf"
-            test_location: "gguf"
-          - backend: "torchao"
-            test_location: "torchao"
    runs-on:
      group: aws-g6e-xlarge-plus
    container:
@@ -524,4 +519,4 @@ jobs:
 #        if: always()
 #        run: |
 #          pip install slack_sdk tabulate
-#          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
+#          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
@@ -0,0 +1,134 @@
+name: Fast tests for PRs - PEFT backend
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "src/diffusers/**.py"
+      - "tests/**.py"
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+env:
+  DIFFUSERS_IS_CI: yes
+  OMP_NUM_THREADS: 4
+  MKL_NUM_THREADS: 4
+  PYTEST_TIMEOUT: 60
+
+jobs:
+  check_code_quality:
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[quality]
+      - name: Check quality
+        run: make quality
+      - name: Check if failure
+        if: ${{ failure() }}
+        run: |
+          echo "Quality check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make style && make quality'" >> $GITHUB_STEP_SUMMARY
+
+  check_repository_consistency:
+    needs: check_code_quality
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v3
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: "3.8"
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install .[quality]
+      - name: Check repo consistency
+        run: |
+          python utils/check_copies.py
+          python utils/check_dummies.py
+          make deps_table_check_updated
+      - name: Check if failure
+        if: ${{ failure() }}
+        run: |
+          echo "Repo consistency check failed. Please ensure the right dependency versions are installed with 'pip install -e .[quality]' and run 'make fix-copies'" >> $GITHUB_STEP_SUMMARY
+
+  run_fast_tests:
+    needs: [check_code_quality, check_repository_consistency]
+    strategy:
+      fail-fast: false
+      matrix:
+        lib-versions: ["main", "latest"]
+
+
+    name: LoRA - ${{ matrix.lib-versions }}
+
+    runs-on:
+      group: aws-general-8-plus
+
+    container:
+      image: diffusers/diffusers-pytorch-cpu
+      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
+
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+    - name: Checkout diffusers
+      uses: actions/checkout@v3
+      with:
+        fetch-depth: 2
+
+    - name: Install dependencies
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m uv pip install -e [quality,test]
+        # TODO (sayakpaul, DN6): revisit `--no-deps`
+        if [ "${{ matrix.lib-versions }}" == "main" ]; then
+            python -m pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
+            python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
+            pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
+        else
+            python -m uv pip install -U peft --no-deps
+            python -m uv pip install -U transformers accelerate --no-deps
+        fi
+
+    - name: Environment
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python utils/print_env.py
+
+    - name: Run fast PyTorch LoRA CPU tests with PEFT backend
+      run: |
+        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+          -s -v \
+          --make-reports=tests_${{ matrix.lib-versions }} \
+          tests/lora/
+        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
+          -s -v \
+          --make-reports=tests_models_lora_${{ matrix.lib-versions }} \
+          tests/models/ -k "lora"
+
+
+    - name: Failure short reports
+      if: ${{ failure() }}
+      run: |
+        cat reports/tests_${{ matrix.lib-versions }}_failures_short.txt
+        cat reports/tests_models_lora_${{ matrix.lib-versions }}_failures_short.txt
+
+    - name: Test suite reports artifacts
+      if: ${{ always() }}
+      uses: actions/upload-artifact@v4
+      with:
+        name: pr_${{ matrix.lib-versions }}_test_reports
+        path: reports
@@ -234,67 +234,3 @@ jobs:
      with:
        name: pr_${{ matrix.config.report }}_test_reports
        path: reports
-
-  run_lora_tests:
-    needs: [check_code_quality, check_repository_consistency]
-    strategy:
-      fail-fast: false
-
-    name: LoRA tests with PEFT main
-
-    runs-on:
-      group: aws-general-8-plus
-
-    container:
-      image: diffusers/diffusers-pytorch-cpu
-      options: --shm-size "16gb" --ipc host -v /mnt/hf_cache:/mnt/cache/
-
-    defaults:
-      run:
-        shell: bash
-
-    steps:
-    - name: Checkout diffusers
-      uses: actions/checkout@v3
-      with:
-        fetch-depth: 2
-
-    - name: Install dependencies
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m uv pip install -e [quality,test]
-        # TODO (sayakpaul, DN6): revisit `--no-deps`
-        python -m pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
-        python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
-        pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
-
-    - name: Environment
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python utils/print_env.py
-
-    - name: Run fast PyTorch LoRA tests with PEFT
-      run: |
-        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-          -s -v \
-          --make-reports=tests_peft_main \
-          tests/lora/
-        python -m pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-          -s -v \
-          --make-reports=tests_models_lora_peft_main \
-          tests/models/ -k "lora"
-
-    - name: Failure short reports
-      if: ${{ failure() }}
-      run: |
-        cat reports/tests_lora_failures_short.txt
-        cat reports/tests_models_lora_failures_short.txt
-
-    - name: Test suite reports artifacts
-      if: ${{ always() }}
-      uses: actions/upload-artifact@v4
-      with:
-        name: pr_main_test_reports
-        path: reports
-
@@ -161,11 +161,10 @@ jobs:

  flax_tpu_tests:
    name: Flax TPU Tests
-    runs-on:
-      group: gcp-ct5lp-hightpu-8t
+    runs-on: docker-tpu
    container:
      image: diffusers/diffusers-flax-tpu
-      options: --shm-size "16gb" --ipc host --privileged ${{ vars.V5_LITEPOD_8_ENV}} -v /mnt/hf_cache:/mnt/hf_cache 
+      options: --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/ --privileged
    defaults:
      run:
        shell: bash
@@ -46,7 +46,7 @@ jobs:
      shell: arch -arch arm64 bash {0}
      run: |
        ${CONDA_RUN} python -m pip install --upgrade pip uv
-        ${CONDA_RUN} python -m uv pip install -e ".[quality,test]"
+        ${CONDA_RUN} python -m uv pip install -e [quality,test]
        ${CONDA_RUN} python -m uv pip install torch torchvision torchaudio
        ${CONDA_RUN} python -m uv pip install accelerate@git+https://github.com/huggingface/accelerate.git
        ${CONDA_RUN} python -m uv pip install transformers --upgrade
@@ -68,7 +68,7 @@ jobs:
      - name: Test installing diffusers and importing
        run: |
          pip install diffusers && pip uninstall diffusers -y
-          pip install -i https://test.pypi.org/simple/ diffusers
+          pip install -i https://testpypi.python.org/pypi diffusers
          python -c "from diffusers import __version__; print(__version__)"
          python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('fusing/unet-ldm-dummy-update'); pipe()"
          python -c "from diffusers import DiffusionPipeline; pipe = DiffusionPipeline.from_pretrained('hf-internal-testing/tiny-stable-diffusion-pipe', safety_checker=None); pipe('ah suh du')"
@@ -112,8 +112,8 @@ Check out the [Quickstart](https://huggingface.co/docs/diffusers/quicktour) to l
 | **Documentation**                                                   | **What can I learn?**                                                                                                                                                                           |
 |---------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
 | [Tutorial](https://huggingface.co/docs/diffusers/tutorials/tutorial_overview)                                                            | A basic crash course for learning how to use the library's most important features like using models and schedulers to build your own diffusion system, and training your own diffusion model.  |
-| [Loading](https://huggingface.co/docs/diffusers/using-diffusers/loading)                                                             | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers.                                         |
-| [Pipelines for inference](https://huggingface.co/docs/diffusers/using-diffusers/overview_techniques)                                             | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library.               |
+| [Loading](https://huggingface.co/docs/diffusers/using-diffusers/loading_overview)                                                             | Guides for how to load and configure all the components (pipelines, models, and schedulers) of the library, as well as how to use different schedulers.                                         |
+| [Pipelines for inference](https://huggingface.co/docs/diffusers/using-diffusers/pipeline_overview)                                             | Guides for how to use pipelines for different inference tasks, batched generation, controlling generated outputs and randomness, and how to contribute a pipeline to the library.               |
 | [Optimization](https://huggingface.co/docs/diffusers/optimization/fp16)                                                        | Guides for how to optimize your diffusion model to run faster and consume less memory.                                                                                                          |
 | [Training](https://huggingface.co/docs/diffusers/training/overview) | Guides for how to train a diffusion model for different tasks with different training techniques.                                                                                               |
 ## Contribution
@@ -157,10 +157,6 @@
    title: Getting Started
  - local: quantization/bitsandbytes
    title: bitsandbytes
-  - local: quantization/gguf
-    title: gguf
-  - local: quantization/torchao
-    title: torchao
  title: Quantization Methods
 - sections:
  - local: optimization/fp16
@@ -238,8 +234,6 @@
      title: Textual Inversion
    - local: api/loaders/unet
      title: UNet
-    - local: api/loaders/transformer_sd3
-      title: SD3Transformer2D
    - local: api/loaders/peft
      title: PEFT
    title: Loaders
@@ -258,8 +252,6 @@
        title: SD3ControlNetModel
      - local: api/models/controlnet_sparsectrl
        title: SparseControlNetModel
-      - local: api/models/controlnet_union
-        title: ControlNetUnionModel
      title: ControlNets
    - sections:
      - local: api/models/allegro_transformer3d
@@ -276,14 +268,10 @@
        title: FluxTransformer2DModel
      - local: api/models/hunyuan_transformer2d
        title: HunyuanDiT2DModel
-      - local: api/models/hunyuan_video_transformer_3d
-        title: HunyuanVideoTransformer3DModel
      - local: api/models/latte_transformer3d
        title: LatteTransformer3DModel
      - local: api/models/lumina_nextdit2d
        title: LuminaNextDiT2DModel
-      - local: api/models/ltx_video_transformer3d
-        title: LTXVideoTransformer3DModel
      - local: api/models/mochi_transformer3d
        title: MochiTransformer3DModel
      - local: api/models/pixart_transformer2d
@@ -292,8 +280,6 @@
        title: PriorTransformer
      - local: api/models/sd3_transformer2d
        title: SD3Transformer2DModel
-      - local: api/models/sana_transformer2d
-        title: SanaTransformer2DModel
      - local: api/models/stable_audio_transformer
        title: StableAudioDiTModel
      - local: api/models/transformer2d
@@ -324,16 +310,10 @@
        title: AutoencoderKLAllegro
      - local: api/models/autoencoderkl_cogvideox
        title: AutoencoderKLCogVideoX
-      - local: api/models/autoencoder_kl_hunyuan_video
-        title: AutoencoderKLHunyuanVideo
-      - local: api/models/autoencoderkl_ltx_video
-        title: AutoencoderKLLTXVideo
      - local: api/models/autoencoderkl_mochi
        title: AutoencoderKLMochi
      - local: api/models/asymmetricautoencoderkl
        title: AsymmetricAutoencoderKL
-      - local: api/models/autoencoder_dc
-        title: AutoencoderDC
      - local: api/models/consistency_decoder_vae
        title: ConsistencyDecoderVAE
      - local: api/models/autoencoder_oobleck
@@ -386,8 +366,6 @@
      title: ControlNet-XS
    - local: api/pipelines/controlnetxs_sdxl
      title: ControlNet-XS with Stable Diffusion XL
-    - local: api/pipelines/controlnet_union
-      title: ControlNetUnion
    - local: api/pipelines/dance_diffusion
      title: Dance Diffusion
    - local: api/pipelines/ddim
@@ -402,12 +380,8 @@
      title: DiT
    - local: api/pipelines/flux
      title: Flux
-    - local: api/pipelines/control_flux_inpaint
-      title: FluxControlInpaint
    - local: api/pipelines/hunyuandit
      title: Hunyuan-DiT
-    - local: api/pipelines/hunyuan_video
-      title: HunyuanVideo
    - local: api/pipelines/i2vgenxl
      title: I2VGen-XL
    - local: api/pipelines/pix2pix
@@ -428,8 +402,6 @@
      title: Latte
    - local: api/pipelines/ledits_pp
      title: LEDITS++
-    - local: api/pipelines/ltx_video
-      title: LTXVideo
    - local: api/pipelines/lumina
      title: Lumina-T2X
    - local: api/pipelines/marigold
@@ -450,8 +422,6 @@
      title: PixArt-α
    - local: api/pipelines/pixart_sigma
      title: PixArt-Σ
-    - local: api/pipelines/sana
-      title: Sana
    - local: api/pipelines/self_attention_guidance
      title: Self-Attention Guidance
    - local: api/pipelines/semantic_stable_diffusion
@@ -15,135 +15,40 @@ specific language governing permissions and limitations under the License.
 An attention processor is a class for applying different types of attention mechanisms.

 ## AttnProcessor
-
 [[autodoc]] models.attention_processor.AttnProcessor

+## AttnProcessor2_0
 [[autodoc]] models.attention_processor.AttnProcessor2_0

+## AttnAddedKVProcessor
 [[autodoc]] models.attention_processor.AttnAddedKVProcessor

+## AttnAddedKVProcessor2_0
 [[autodoc]] models.attention_processor.AttnAddedKVProcessor2_0

-[[autodoc]] models.attention_processor.AttnProcessorNPU
-
-[[autodoc]] models.attention_processor.FusedAttnProcessor2_0
-
-## Allegro
-
-[[autodoc]] models.attention_processor.AllegroAttnProcessor2_0
-
-## AuraFlow
-
-[[autodoc]] models.attention_processor.AuraFlowAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.FusedAuraFlowAttnProcessor2_0
-
-## CogVideoX
-
-[[autodoc]] models.attention_processor.CogVideoXAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.FusedCogVideoXAttnProcessor2_0
-
 ## CrossFrameAttnProcessor
-
 [[autodoc]] pipelines.text_to_video_synthesis.pipeline_text_to_video_zero.CrossFrameAttnProcessor

-## Custom Diffusion
-
+## CustomDiffusionAttnProcessor
 [[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor

+## CustomDiffusionAttnProcessor2_0
 [[autodoc]] models.attention_processor.CustomDiffusionAttnProcessor2_0

+## CustomDiffusionXFormersAttnProcessor
 [[autodoc]] models.attention_processor.CustomDiffusionXFormersAttnProcessor

-## Flux
-
-[[autodoc]] models.attention_processor.FluxAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.FusedFluxAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.FluxSingleAttnProcessor2_0
-
-## Hunyuan
-
-[[autodoc]] models.attention_processor.HunyuanAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.FusedHunyuanAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.PAGHunyuanAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.PAGCFGHunyuanAttnProcessor2_0
-
-## IdentitySelfAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.PAGIdentitySelfAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.PAGCFGIdentitySelfAttnProcessor2_0
-
-## IP-Adapter
-
-[[autodoc]] models.attention_processor.IPAdapterAttnProcessor
-
-[[autodoc]] models.attention_processor.IPAdapterAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.SD3IPAdapterJointAttnProcessor2_0
-
-## JointAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.JointAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.PAGJointAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.PAGCFGJointAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.FusedJointAttnProcessor2_0
-
-## LoRA
-
-[[autodoc]] models.attention_processor.LoRAAttnProcessor
-
-[[autodoc]] models.attention_processor.LoRAAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.LoRAAttnAddedKVProcessor
-
-[[autodoc]] models.attention_processor.LoRAXFormersAttnProcessor
-
-## Lumina-T2X
-
-[[autodoc]] models.attention_processor.LuminaAttnProcessor2_0
-
-## Mochi
-
-[[autodoc]] models.attention_processor.MochiAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.MochiVaeAttnProcessor2_0
-
-## Sana
-
-[[autodoc]] models.attention_processor.SanaLinearAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.SanaMultiscaleAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.PAGCFGSanaLinearAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.PAGIdentitySanaLinearAttnProcessor2_0
-
-## Stable Audio
-
-[[autodoc]] models.attention_processor.StableAudioAttnProcessor2_0
+## FusedAttnProcessor2_0
+[[autodoc]] models.attention_processor.FusedAttnProcessor2_0

 ## SlicedAttnProcessor
-
 [[autodoc]] models.attention_processor.SlicedAttnProcessor

+## SlicedAttnAddedKVProcessor
 [[autodoc]] models.attention_processor.SlicedAttnAddedKVProcessor

 ## XFormersAttnProcessor
-
 [[autodoc]] models.attention_processor.XFormersAttnProcessor

-[[autodoc]] models.attention_processor.XFormersAttnAddedKVProcessor
-
-## XLAFlashAttnProcessor2_0
-
-[[autodoc]] models.attention_processor.XLAFlashAttnProcessor2_0
+## AttnProcessorNPU
+[[autodoc]] models.attention_processor.AttnProcessorNPU
@@ -24,12 +24,6 @@ Learn how to load an IP-Adapter checkpoint and image in the IP-Adapter [loading]

 [[autodoc]] loaders.ip_adapter.IPAdapterMixin

-## SD3IPAdapterMixin
-
-[[autodoc]] loaders.ip_adapter.SD3IPAdapterMixin
-    - all
-    - is_ip_adapter_active
-
 ## IPAdapterMaskProcessor

 [[autodoc]] image_processor.IPAdapterMaskProcessor
@@ -17,9 +17,6 @@ LoRA is a fast and lightweight training method that inserts and trains a signifi
 - [`StableDiffusionLoraLoaderMixin`] provides functions for loading and unloading, fusing and unfusing, enabling and disabling, and more functions for managing LoRA weights. This class can be used with any model.
 - [`StableDiffusionXLLoraLoaderMixin`] is a [Stable Diffusion (SDXL)](../../api/pipelines/stable_diffusion/stable_diffusion_xl) version of the [`StableDiffusionLoraLoaderMixin`] class for loading and saving LoRA weights. It can only be used with the SDXL model.
 - [`SD3LoraLoaderMixin`] provides similar functions for [Stable Diffusion 3](https://huggingface.co/blog/sd3).
- [`FluxLoraLoaderMixin`] provides similar functions for [Flux](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux).
- [`CogVideoXLoraLoaderMixin`] provides similar functions for [CogVideoX](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogvideox).
- [`Mochi1LoraLoaderMixin`] provides similar functions for [Mochi](https://huggingface.co/docs/diffusers/main/en/api/pipelines/mochi).
 - [`AmusedLoraLoaderMixin`] is for the [`AmusedPipeline`].
 - [`LoraBaseMixin`] provides a base class with several utility methods to fuse, unfuse, and unload LoRAs, and more.

@@ -41,18 +38,6 @@ To learn more about how to load LoRA weights, see the [LoRA](../../using-diffuse

 [[autodoc]] loaders.lora_pipeline.SD3LoraLoaderMixin

-## FluxLoraLoaderMixin
-
-[[autodoc]] loaders.lora_pipeline.FluxLoraLoaderMixin
-
-## CogVideoXLoraLoaderMixin
-
-[[autodoc]] loaders.lora_pipeline.CogVideoXLoraLoaderMixin
-
-## Mochi1LoraLoaderMixin
-
-[[autodoc]] loaders.lora_pipeline.Mochi1LoraLoaderMixin
-
 ## AmusedLoraLoaderMixin

 [[autodoc]] loaders.lora_pipeline.AmusedLoraLoaderMixin
@@ -1,29 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# SD3Transformer2D
-
-This class is useful when *only* loading weights into a [`SD3Transformer2DModel`]. If you need to load weights into the text encoder, or into both a text encoder and the [`SD3Transformer2DModel`], use the [`SD3LoraLoaderMixin`](lora#diffusers.loaders.SD3LoraLoaderMixin) class instead.
-
-The [`SD3Transformer2DLoadersMixin`] class currently only loads IP-Adapter weights, but will be used in the future to save weights and load LoRAs.
-
-<Tip>
-
-To learn more about how to load LoRA weights, see the [LoRA](../../using-diffusers/loading_adapters#lora) loading guide.
-
-</Tip>
-
-## SD3Transformer2DLoadersMixin
-
-[[autodoc]] loaders.transformer_sd3.SD3Transformer2DLoadersMixin
-    - all
-    - _load_ip_adapter_weights
@@ -1,72 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# AutoencoderDC
-
-The 2D Autoencoder model used in [SANA](https://huggingface.co/papers/2410.10629) and introduced in [DCAE](https://huggingface.co/papers/2410.10733) by authors Junyu Chen\*, Han Cai\*, Junsong Chen, Enze Xie, Shang Yang, Haotian Tang, Muyang Li, Yao Lu, Song Han from MIT HAN Lab.
-
-The abstract from the paper is:
-
-*We present Deep Compression Autoencoder (DC-AE), a new family of autoencoder models for accelerating high-resolution diffusion models. Existing autoencoder models have demonstrated impressive results at a moderate spatial compression ratio (e.g., 8x), but fail to maintain satisfactory reconstruction accuracy for high spatial compression ratios (e.g., 64x). We address this challenge by introducing two key techniques: (1) Residual Autoencoding, where we design our models to learn residuals based on the space-to-channel transformed features to alleviate the optimization difficulty of high spatial-compression autoencoders; (2) Decoupled High-Resolution Adaptation, an efficient decoupled three-phases training strategy for mitigating the generalization penalty of high spatial-compression autoencoders. With these designs, we improve the autoencoder's spatial compression ratio up to 128 while maintaining the reconstruction quality. Applying our DC-AE to latent diffusion models, we achieve significant speedup without accuracy drop. For example, on ImageNet 512x512, our DC-AE provides 19.1x inference speedup and 17.9x training speedup on H100 GPU for UViT-H while achieving a better FID, compared with the widely used SD-VAE-f8 autoencoder. Our code is available at [this https URL](https://github.com/mit-han-lab/efficientvit).*
-
-The following DCAE models are released and supported in Diffusers.
-
-| Diffusers format | Original format |
-|:----------------:|:---------------:|
-| [`mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers`](https://huggingface.co/mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers) | [`mit-han-lab/dc-ae-f32c32-sana-1.0`](https://huggingface.co/mit-han-lab/dc-ae-f32c32-sana-1.0)
-| [`mit-han-lab/dc-ae-f32c32-in-1.0-diffusers`](https://huggingface.co/mit-han-lab/dc-ae-f32c32-in-1.0-diffusers) | [`mit-han-lab/dc-ae-f32c32-in-1.0`](https://huggingface.co/mit-han-lab/dc-ae-f32c32-in-1.0)
-| [`mit-han-lab/dc-ae-f32c32-mix-1.0-diffusers`](https://huggingface.co/mit-han-lab/dc-ae-f32c32-mix-1.0-diffusers) | [`mit-han-lab/dc-ae-f32c32-mix-1.0`](https://huggingface.co/mit-han-lab/dc-ae-f32c32-mix-1.0)
-| [`mit-han-lab/dc-ae-f64c128-in-1.0-diffusers`](https://huggingface.co/mit-han-lab/dc-ae-f64c128-in-1.0-diffusers) | [`mit-han-lab/dc-ae-f64c128-in-1.0`](https://huggingface.co/mit-han-lab/dc-ae-f64c128-in-1.0)
-| [`mit-han-lab/dc-ae-f64c128-mix-1.0-diffusers`](https://huggingface.co/mit-han-lab/dc-ae-f64c128-mix-1.0-diffusers) | [`mit-han-lab/dc-ae-f64c128-mix-1.0`](https://huggingface.co/mit-han-lab/dc-ae-f64c128-mix-1.0)
-| [`mit-han-lab/dc-ae-f128c512-in-1.0-diffusers`](https://huggingface.co/mit-han-lab/dc-ae-f128c512-in-1.0-diffusers) | [`mit-han-lab/dc-ae-f128c512-in-1.0`](https://huggingface.co/mit-han-lab/dc-ae-f128c512-in-1.0)
-| [`mit-han-lab/dc-ae-f128c512-mix-1.0-diffusers`](https://huggingface.co/mit-han-lab/dc-ae-f128c512-mix-1.0-diffusers) | [`mit-han-lab/dc-ae-f128c512-mix-1.0`](https://huggingface.co/mit-han-lab/dc-ae-f128c512-mix-1.0)
-
-This model was contributed by [lawrence-cj](https://github.com/lawrence-cj).
-
-Load a model in Diffusers format with [`~ModelMixin.from_pretrained`].
-
-```python
-from diffusers import AutoencoderDC
-
-ae = AutoencoderDC.from_pretrained("mit-han-lab/dc-ae-f32c32-sana-1.0-diffusers", torch_dtype=torch.float32).to("cuda")
-```
-
-## Load a model in Diffusers via `from_single_file`
-
-```python
-from diffusers import AutoencoderDC
-
-ckpt_path = "https://huggingface.co/mit-han-lab/dc-ae-f32c32-sana-1.0/blob/main/model.safetensors"
-model = AutoencoderDC.from_single_file(ckpt_path) 
-
-```
-
-The `AutoencoderDC` model has `in` and `mix` single file checkpoint variants that have matching checkpoint keys, but use different scaling factors. It is not possible for Diffusers to automatically infer the correct config file to use with the model based on just the checkpoint and will default to configuring the model using the `mix` variant config file. To override the automatically determined config, please use the `config` argument when using single file loading with `in` variant checkpoints. 
-
-```python
-from diffusers import AutoencoderDC
-
-ckpt_path = "https://huggingface.co/mit-han-lab/dc-ae-f128c512-in-1.0/blob/main/model.safetensors"
-model = AutoencoderDC.from_single_file(ckpt_path, config="mit-han-lab/dc-ae-f128c512-in-1.0-diffusers")
-```
-
-
-## AutoencoderDC
-
-[[autodoc]] AutoencoderDC
-  - encode
-  - decode
-  - all
-
-## DecoderOutput
-
-[[autodoc]] models.autoencoders.vae.DecoderOutput
-
@@ -1,32 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# AutoencoderKLHunyuanVideo
-
-The 3D variational autoencoder (VAE) model with KL loss used in [HunyuanVideo](https://github.com/Tencent/HunyuanVideo/), which was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import AutoencoderKLHunyuanVideo
-
-vae = AutoencoderKLHunyuanVideo.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="vae", torch_dtype=torch.float16)
-```
-
-## AutoencoderKLHunyuanVideo
-
-[[autodoc]] AutoencoderKLHunyuanVideo
-  - decode
-  - all
-
-## DecoderOutput
-
-[[autodoc]] models.autoencoders.vae.DecoderOutput
@@ -1,37 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# AutoencoderKLLTXVideo
-
-The 3D variational autoencoder (VAE) model with KL loss used in [LTX](https://huggingface.co/Lightricks/LTX-Video) was introduced by Lightricks.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import AutoencoderKLLTXVideo
-
-vae = AutoencoderKLLTXVideo.from_pretrained("Lightricks/LTX-Video", subfolder="vae", torch_dtype=torch.float32).to("cuda")
-```
-
-## AutoencoderKLLTXVideo
-
-[[autodoc]] AutoencoderKLLTXVideo
-    - decode
-    - encode
-    - all
-
-## AutoencoderKLOutput
-
-[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
-
-## DecoderOutput
-
-[[autodoc]] models.autoencoders.vae.DecoderOutput
@@ -1,35 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team and The InstantX Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# ControlNetUnionModel
-
-ControlNetUnionModel is an implementation of ControlNet for Stable Diffusion XL.
-
-The ControlNet model was introduced in [ControlNetPlus](https://github.com/xinsir6/ControlNetPlus) by xinsir6. It supports multiple conditioning inputs without increasing computation.
-
-*We design a new architecture that can support 10+ control types in condition text-to-image generation and can generate high resolution images visually comparable with midjourney. The network is based on the original ControlNet architecture, we propose two new modules to: 1 Extend the original ControlNet to support different image conditions using the same network parameter. 2 Support multiple conditions input without increasing computation offload, which is especially important for designers who want to edit image in detail, different conditions use the same condition encoder, without adding extra computations or parameters.*
-
-## Loading
-
-By default the [`ControlNetUnionModel`] should be loaded with [`~ModelMixin.from_pretrained`].
-
-```py
-from diffusers import StableDiffusionXLControlNetUnionPipeline, ControlNetUnionModel
-
-controlnet = ControlNetUnionModel.from_pretrained("xinsir/controlnet-union-sdxl-1.0")
-pipe = StableDiffusionXLControlNetUnionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet)
-```
-
-## ControlNetUnionModel
-
-[[autodoc]] ControlNetUnionModel
-
@@ -1,30 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# HunyuanVideoTransformer3DModel
-
-A Diffusion Transformer model for 3D video-like data was introduced in [HunyuanVideo: A Systematic Framework For Large Video Generative Models](https://huggingface.co/papers/2412.03603) by Tencent.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import HunyuanVideoTransformer3DModel
-
-transformer = HunyuanVideoTransformer3DModel.from_pretrained("hunyuanvideo-community/HunyuanVideo", subfolder="transformer", torch_dtype=torch.bfloat16)
-```
-
-## HunyuanVideoTransformer3DModel
-
-[[autodoc]] HunyuanVideoTransformer3DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
@@ -1,30 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# LTXVideoTransformer3DModel
-
-A Diffusion Transformer model for 3D data from [LTX](https://huggingface.co/Lightricks/LTX-Video) was introduced by Lightricks.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import LTXVideoTransformer3DModel
-
-transformer = LTXVideoTransformer3DModel.from_pretrained("Lightricks/LTX-Video", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda")
-```
-
-## LTXVideoTransformer3DModel
-
-[[autodoc]] LTXVideoTransformer3DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
@@ -1,34 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# SanaTransformer2DModel
-
-A Diffusion Transformer model for 2D data from [SANA: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformers](https://huggingface.co/papers/2410.10629) was introduced by Enze Xie, Junsong Chen, Junyu Chen, Han Cai, Haotian Tang, Yujun Lin, Zhekai Zhang, Muyang Li, Ligeng Zhu, Yao Lu, and Song Han of NVIDIA and MIT HAN Lab.
-
-The abstract from the paper is:
-
-*We introduce Sana, a text-to-image framework that can efficiently generate images up to 4096×4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU. Core designs include: (1) Deep compression autoencoder: unlike traditional AEs, which compress images only 8×, we trained an AE that can compress images 32×, effectively reducing the number of latent tokens. (2) Linear DiT: we replace all vanilla attention in DiT with linear attention, which is more efficient at high resolutions without sacrificing quality. (3) Decoder-only text encoder: we replaced T5 with modern decoder-only small LLM as the text encoder and designed complex human instruction with in-context learning to enhance the image-text alignment. (4) Efficient training and sampling: we propose Flow-DPM-Solver to reduce sampling steps, with efficient caption labeling and selection to accelerate convergence. As a result, Sana-0.6B is very competitive with modern giant diffusion model (e.g. Flux-12B), being 20 times smaller and 100+ times faster in measured throughput. Moreover, Sana-0.6B can be deployed on a 16GB laptop GPU, taking less than 1 second to generate a 1024×1024 resolution image. Sana enables content creation at low cost. Code and model will be publicly released.*
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import SanaTransformer2DModel
-
-transformer = SanaTransformer2DModel.from_pretrained("Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
-```
-
-## SanaTransformer2DModel
-
-[[autodoc]] SanaTransformer2DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
@@ -1,89 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team, The Black Forest Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# FluxControlInpaint
-
-FluxControlInpaintPipeline is an implementation of Inpainting for Flux.1 Depth/Canny models. It is a pipeline that allows you to inpaint images using the Flux.1 Depth/Canny models. The pipeline takes an image and a mask as input and returns the inpainted image.
-
-FLUX.1 Depth and Canny [dev] is a 12 billion parameter rectified flow transformer capable of generating an image based on a text description while following the structure of a given input image. **This is not a ControlNet model**.
-
-| Control type | Developer | Link |
-| -------- | ---------- | ---- |
-| Depth | [Black Forest Labs](https://huggingface.co/black-forest-labs) | [Link](https://huggingface.co/black-forest-labs/FLUX.1-Depth-dev) |
-| Canny | [Black Forest Labs](https://huggingface.co/black-forest-labs) | [Link](https://huggingface.co/black-forest-labs/FLUX.1-Canny-dev) |
-
-
-<Tip>
-
-Flux can be quite expensive to run on consumer hardware devices. However, you can perform a suite of optimizations to run it faster and in a more memory-friendly manner. Check out [this section](https://huggingface.co/blog/sd3#memory-optimizations-for-sd3) for more details. Additionally, Flux can benefit from quantization for memory efficiency with a trade-off in inference latency. Refer to [this blog post](https://huggingface.co/blog/quanto-diffusers) to learn more. For an exhaustive list of resources, check out [this gist](https://gist.github.com/sayakpaul/b664605caf0aa3bf8585ab109dd5ac9c).
-
-</Tip>
-
-```python
-import torch
-from diffusers import FluxControlInpaintPipeline
-from diffusers.models.transformers import FluxTransformer2DModel
-from transformers import T5EncoderModel
-from diffusers.utils import load_image, make_image_grid
-from image_gen_aux import DepthPreprocessor # https://github.com/huggingface/image_gen_aux
-from PIL import Image
-import numpy as np
-
-pipe = FluxControlInpaintPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-Depth-dev",
-    torch_dtype=torch.bfloat16,
-)
-# use following lines if you have GPU constraints
-# ---------------------------------------------------------------
-transformer = FluxTransformer2DModel.from_pretrained(
-    "sayakpaul/FLUX.1-Depth-dev-nf4", subfolder="transformer", torch_dtype=torch.bfloat16
-)
-text_encoder_2 = T5EncoderModel.from_pretrained(
-    "sayakpaul/FLUX.1-Depth-dev-nf4", subfolder="text_encoder_2", torch_dtype=torch.bfloat16
-)
-pipe.transformer = transformer
-pipe.text_encoder_2 = text_encoder_2
-pipe.enable_model_cpu_offload()
-# ---------------------------------------------------------------
-pipe.to("cuda")
-
-prompt = "a blue robot singing opera with human-like expressions"
-image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")
-
-head_mask = np.zeros_like(image)
-head_mask[65:580,300:642] = 255
-mask_image = Image.fromarray(head_mask)
-
-processor = DepthPreprocessor.from_pretrained("LiheYoung/depth-anything-large-hf")
-control_image = processor(image)[0].convert("RGB")
-
-output = pipe(
-    prompt=prompt,
-    image=image,
-    control_image=control_image,
-    mask_image=mask_image,
-    num_inference_steps=30,
-    strength=0.9,
-    guidance_scale=10.0,
-    generator=torch.Generator().manual_seed(42),
-).images[0]
-make_image_grid([image, control_image, mask_image, output.resize(image.size)], rows=1, cols=4).save("output.png")
-```
-
-## FluxControlInpaintPipeline
-[[autodoc]] FluxControlInpaintPipeline
-	- all
-	- __call__
-
-
-## FluxPipelineOutput
-[[autodoc]] pipelines.flux.pipeline_output.FluxPipelineOutput
@@ -1,35 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# ControlNetUnion
-
-ControlNetUnionModel is an implementation of ControlNet for Stable Diffusion XL.
-
-The ControlNet model was introduced in [ControlNetPlus](https://github.com/xinsir6/ControlNetPlus) by xinsir6. It supports multiple conditioning inputs without increasing computation.
-
-*We design a new architecture that can support 10+ control types in condition text-to-image generation and can generate high resolution images visually comparable with midjourney. The network is based on the original ControlNet architecture, we propose two new modules to: 1 Extend the original ControlNet to support different image conditions using the same network parameter. 2 Support multiple conditions input without increasing computation offload, which is especially important for designers who want to edit image in detail, different conditions use the same condition encoder, without adding extra computations or parameters.*
-
-
-## StableDiffusionXLControlNetUnionPipeline
-[[autodoc]] StableDiffusionXLControlNetUnionPipeline
-	- all
-	- __call__
-
-## StableDiffusionXLControlNetUnionImg2ImgPipeline
-[[autodoc]] StableDiffusionXLControlNetUnionImg2ImgPipeline
-	- all
-	- __call__
-
-## StableDiffusionXLControlNetUnionInpaintPipeline
-[[autodoc]] StableDiffusionXLControlNetUnionInpaintPipeline
-	- all
-	- __call__
@@ -143,35 +143,6 @@ image = pipe(
 image.save("output.png")
 ```

-Canny Control is also possible with a LoRA variant of this condition. The usage is as follows:
-
-```python
-# !pip install -U controlnet-aux
-import torch
-from controlnet_aux import CannyDetector
-from diffusers import FluxControlPipeline
-from diffusers.utils import load_image
-
-pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")
-pipe.load_lora_weights("black-forest-labs/FLUX.1-Canny-dev-lora")
-
-prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
-control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")
-
-processor = CannyDetector()
-control_image = processor(control_image, low_threshold=50, high_threshold=200, detect_resolution=1024, image_resolution=1024)
-
-image = pipe(
-    prompt=prompt,
-    control_image=control_image,
-    height=1024,
-    width=1024,
-    num_inference_steps=50,
-    guidance_scale=30.0,
-).images[0]
-image.save("output.png")
-```
-
 ### Depth Control

 **Note:** `black-forest-labs/Flux.1-Depth-dev` is _not_ a ControlNet model. [`ControlNetModel`] models are a separate component from the UNet/Transformer whose residuals are added to the actual underlying model. Depth Control is an alternate architecture that achieves effectively the same results as a ControlNet model would, by using channel-wise concatenation with input control condition and ensuring the transformer learns structure control by following the condition as closely as possible.
@@ -203,36 +174,6 @@ image = pipe(
 image.save("output.png")
 ```

-Depth Control is also possible with a LoRA variant of this condition. The usage is as follows:
-
-```python
-# !pip install git+https://github.com/huggingface/image_gen_aux
-import torch
-from diffusers import FluxControlPipeline, FluxTransformer2DModel
-from diffusers.utils import load_image
-from image_gen_aux import DepthPreprocessor
-
-pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")
-pipe.load_lora_weights("black-forest-labs/FLUX.1-Depth-dev-lora")
-
-prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
-control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")
-
-processor = DepthPreprocessor.from_pretrained("LiheYoung/depth-anything-large-hf")
-control_image = processor(control_image)[0].convert("RGB")
-
-image = pipe(
-    prompt=prompt,
-    control_image=control_image,
-    height=1024,
-    width=1024,
-    num_inference_steps=30,
-    guidance_scale=10.0,
-    generator=torch.Generator().manual_seed(42),
-).images[0]
-image.save("output.png")
-```
-
 ### Redux

 * Flux Redux pipeline is an adapter for FLUX.1 base models. It can be used with both flux-dev and flux-schnell, for image-to-image generation.
@@ -268,43 +209,6 @@ images = pipe(
 images[0].save("flux-redux.png")
 ```

-## Combining Flux Turbo LoRAs with Flux Control, Fill, and Redux
-
-We can combine Flux Turbo LoRAs with Flux Control and other pipelines like Fill and Redux to enable few-step inference. The example below shows how to do that for Flux Control LoRA for depth and turbo LoRA from [`ByteDance/Hyper-SD`](https://hf.co/ByteDance/Hyper-SD).
-
-```py
-from diffusers import FluxControlPipeline
-from image_gen_aux import DepthPreprocessor
-from diffusers.utils import load_image
-from huggingface_hub import hf_hub_download
-import torch
-
-control_pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)
-control_pipe.load_lora_weights("black-forest-labs/FLUX.1-Depth-dev-lora", adapter_name="depth")
-control_pipe.load_lora_weights(
-    hf_hub_download("ByteDance/Hyper-SD", "Hyper-FLUX.1-dev-8steps-lora.safetensors"), adapter_name="hyper-sd"
-)
-control_pipe.set_adapters(["depth", "hyper-sd"], adapter_weights=[0.85, 0.125])
-control_pipe.enable_model_cpu_offload()
-
-prompt = "A robot made of exotic candies and chocolates of different kinds. The background is filled with confetti and celebratory gifts."
-control_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/robot.png")
-
-processor = DepthPreprocessor.from_pretrained("LiheYoung/depth-anything-large-hf")
-control_image = processor(control_image)[0].convert("RGB")
-
-image = control_pipe(
-    prompt=prompt,
-    control_image=control_image,
-    height=1024,
-    width=1024,
-    num_inference_steps=8,
-    guidance_scale=10.0,
-    generator=torch.Generator().manual_seed(42),
-).images[0]
-image.save("output.png")
-```
-
 ## Running FP16 inference

 Flux can generate high-quality images with FP16 (i.e. to accelerate inference on Turing/Volta GPUs) but produces different outputs compared to FP32/BF16. The issue is that some activations in the text encoders have to be clipped when running in FP16, which affects the overall image. Forcing text encoders to run with FP32 inference thus removes this output difference. See [here](https://github.com/huggingface/diffusers/pull/9097#issuecomment-2272292516) for details.
@@ -1,43 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. -->
-
-# HunyuanVideo
-
-[HunyuanVideo](https://www.arxiv.org/abs/2412.03603) by Tencent.
-
-*Recent advancements in video generation have significantly impacted daily life for both individuals and industries. However, the leading video generation models remain closed-source, resulting in a notable performance gap between industry capabilities and those available to the public. In this report, we introduce HunyuanVideo, an innovative open-source video foundation model that demonstrates performance in video generation comparable to, or even surpassing, that of leading closed-source models. HunyuanVideo encompasses a comprehensive framework that integrates several key elements, including data curation, advanced architectural design, progressive model scaling and training, and an efficient infrastructure tailored for large-scale model training and inference. As a result, we successfully trained a video generative model with over 13 billion parameters, making it the largest among all open-source models. We conducted extensive experiments and implemented a series of targeted designs to ensure high visual quality, motion dynamics, text-video alignment, and advanced filming techniques. According to evaluations by professionals, HunyuanVideo outperforms previous state-of-the-art models, including Runway Gen-3, Luma 1.6, and three top-performing Chinese video generative models. By releasing the code for the foundation model and its applications, we aim to bridge the gap between closed-source and open-source communities. This initiative will empower individuals within the community to experiment with their ideas, fostering a more dynamic and vibrant video generation ecosystem. The code is publicly available at [this https URL](https://github.com/Tencent/HunyuanVideo).*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-Recommendations for inference:
- Both text encoders should be in `torch.float16`.
- Transformer should be in `torch.bfloat16`.
- VAE should be in `torch.float16`.
- `num_frames` should be of the form `4 * k + 1`, for example `49` or `129`.
- For smaller resolution videos, try lower values of `shift` (between `2.0` and `5.0`) in the [Scheduler](https://huggingface.co/docs/diffusers/main/en/api/schedulers/flow_match_euler_discrete#diffusers.FlowMatchEulerDiscreteScheduler.shift). For larger resolution videos, try higher values (between `7.0` and `12.0`). The default value is `7.0` for HunyuanVideo.
- For more information about supported resolutions and other details, please refer to the original repository [here](https://github.com/Tencent/HunyuanVideo/).
-
-## HunyuanVideoPipeline
-
-[[autodoc]] HunyuanVideoPipeline
-  - all
-  - __call__
-
-## HunyuanVideoPipelineOutput
-
-[[autodoc]] pipelines.hunyuan_video.pipeline_output.HunyuanVideoPipelineOutput
@@ -1,156 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. -->
-
-# LTX Video
-
-[LTX Video](https://huggingface.co/Lightricks/LTX-Video) is the first DiT-based video generation model capable of generating high-quality videos in real-time. It produces 24 FPS videos at a 768x512 resolution faster than they can be watched. Trained on a large-scale dataset of diverse videos, the model generates high-resolution videos with realistic and varied content. We provide a model for both text-to-video as well as image + text-to-video use cases.
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-Available models:
-
-|  Model name   | Recommended dtype |
-|:-------------:|:-----------------:|
-| [`LTX Video 0.9.0`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.safetensors) | `torch.bfloat16` |
-| [`LTX Video 0.9.1`](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.1.safetensors) | `torch.bfloat16` |
-
-Note: The recommended dtype is for the transformer component. The VAE and text encoders can be either `torch.float32`, `torch.bfloat16` or `torch.float16` but the recommended dtype is `torch.bfloat16` as used in the original repository.
-
-## Loading Single Files
-
-Loading the original LTX Video checkpoints is also possible with [`~ModelMixin.from_single_file`]. We recommend using `from_single_file` for the Lightricks series of models, as they plan to release multiple models in the future in the single file format.
-
-```python
-import torch
-from diffusers import AutoencoderKLLTXVideo, LTXImageToVideoPipeline, LTXVideoTransformer3DModel
-
-# `single_file_url` could also be https://huggingface.co/Lightricks/LTX-Video/ltx-video-2b-v0.9.1.safetensors
-single_file_url = "https://huggingface.co/Lightricks/LTX-Video/ltx-video-2b-v0.9.safetensors"
-transformer = LTXVideoTransformer3DModel.from_single_file(
-  single_file_url, torch_dtype=torch.bfloat16
-)
-vae = AutoencoderKLLTXVideo.from_single_file(single_file_url, torch_dtype=torch.bfloat16)
-pipe = LTXImageToVideoPipeline.from_pretrained(
-  "Lightricks/LTX-Video", transformer=transformer, vae=vae, torch_dtype=torch.bfloat16
-)
-
-# ... inference code ...
-```
-
-Alternatively, the pipeline can be used to load the weights with [`~FromSingleFileMixin.from_single_file`].
-
-```python
-import torch
-from diffusers import LTXImageToVideoPipeline
-from transformers import T5EncoderModel, T5Tokenizer
-
-single_file_url = "https://huggingface.co/Lightricks/LTX-Video/ltx-video-2b-v0.9.safetensors"
-text_encoder = T5EncoderModel.from_pretrained(
-  "Lightricks/LTX-Video", subfolder="text_encoder", torch_dtype=torch.bfloat16
-)
-tokenizer = T5Tokenizer.from_pretrained(
-  "Lightricks/LTX-Video", subfolder="tokenizer", torch_dtype=torch.bfloat16
-)
-pipe = LTXImageToVideoPipeline.from_single_file(
-  single_file_url, text_encoder=text_encoder, tokenizer=tokenizer, torch_dtype=torch.bfloat16
-)
-```
-
-Loading [LTX GGUF checkpoints](https://huggingface.co/city96/LTX-Video-gguf) is also supported:
-
-```py
-import torch
-from diffusers.utils import export_to_video
-from diffusers import LTXPipeline, LTXVideoTransformer3DModel, GGUFQuantizationConfig
-
-ckpt_path = (
-    "https://huggingface.co/city96/LTX-Video-gguf/blob/main/ltx-video-2b-v0.9-Q3_K_S.gguf"
-)
-transformer = LTXVideoTransformer3DModel.from_single_file(
-    ckpt_path,
-    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
-    torch_dtype=torch.bfloat16,
-)
-pipe = LTXPipeline.from_pretrained(
-    "Lightricks/LTX-Video",
-    transformer=transformer,
-    torch_dtype=torch.bfloat16,
-)
-pipe.enable_model_cpu_offload()
-
-prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
-negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
-
-video = pipe(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    width=704,
-    height=480,
-    num_frames=161,
-    num_inference_steps=50,
-).frames[0]
-export_to_video(video, "output_gguf_ltx.mp4", fps=24)
-```
-
-Make sure to read the [documentation on GGUF](../../quantization/gguf) to learn more about our GGUF support.
-
-<!-- TODO(aryan): Update this when official weights are supported -->
-
-Loading and running inference with [LTX Video 0.9.1](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.1.safetensors) weights.
-
-```python
-import torch
-from diffusers import LTXPipeline
-from diffusers.utils import export_to_video
-
-pipe = LTXPipeline.from_pretrained("a-r-r-o-w/LTX-Video-0.9.1-diffusers", torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-
-prompt = "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage"
-negative_prompt = "worst quality, inconsistent motion, blurry, jittery, distorted"
-
-video = pipe(
-    prompt=prompt,
-    negative_prompt=negative_prompt,
-    width=768,
-    height=512,
-    num_frames=161,
-    decode_timestep=0.03,
-    decode_noise_scale=0.025,
-    num_inference_steps=50,
-).frames[0]
-export_to_video(video, "output.mp4", fps=24)
-```
-
-Refer to [this section](https://huggingface.co/docs/diffusers/main/en/api/pipelines/cogvideox#memory-optimization) to learn more about optimizing memory consumption.
-
-## LTXPipeline
-
-[[autodoc]] LTXPipeline
-  - all
-  - __call__
-
-## LTXImageToVideoPipeline
-
-[[autodoc]] LTXImageToVideoPipeline
-  - all
-  - __call__
-
-## LTXPipelineOutput
-
-[[autodoc]] pipelines.ltx.pipeline_output.LTXPipelineOutput
@@ -13,7 +13,7 @@
 # limitations under the License.
 -->

-# Mochi 1 Preview
+# Mochi

 [Mochi 1 Preview](https://huggingface.co/genmo/mochi-1-preview) from Genmo.

@@ -25,201 +25,6 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.m

 </Tip>

-## Generating videos with Mochi-1 Preview
-
-The following example will download the full precision `mochi-1-preview` weights and produce the highest quality results but will require at least 42GB VRAM to run.
-
-```python
-import torch
-from diffusers import MochiPipeline
-from diffusers.utils import export_to_video
-
-pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview")
-
-# Enable memory savings
-pipe.enable_model_cpu_offload()
-pipe.enable_vae_tiling()
-
-prompt = "Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k."
-
-with torch.autocast("cuda", torch.bfloat16, cache_enabled=False):
-      frames = pipe(prompt, num_frames=85).frames[0]
-
-export_to_video(frames, "mochi.mp4", fps=30)
-```
-
-## Using a lower precision variant to save memory
-
-The following example will use the `bfloat16` variant of the model and requires 22GB VRAM to run. There is a slight drop in the quality of the generated video as a result.
-
-```python
-import torch
-from diffusers import MochiPipeline
-from diffusers.utils import export_to_video
-
-pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview", variant="bf16", torch_dtype=torch.bfloat16)
-
-# Enable memory savings
-pipe.enable_model_cpu_offload()
-pipe.enable_vae_tiling()
-
-prompt = "Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k."
-frames = pipe(prompt, num_frames=85).frames[0]
-
-export_to_video(frames, "mochi.mp4", fps=30)
-```
-
-## Reproducing the results from the Genmo Mochi repo
-
-The [Genmo Mochi implementation](https://github.com/genmoai/mochi/tree/main) uses different precision values for each stage in the inference process. The text encoder and VAE use `torch.float32`, while the DiT uses `torch.bfloat16` with the [attention kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html#torch.nn.attention.sdpa_kernel) set to `EFFICIENT_ATTENTION`. Diffusers pipelines currently do not support setting different `dtypes` for different stages of the pipeline. In order to run inference in the same way as the original implementation, please refer to the following example.
-
-<Tip>
-The original Mochi implementation zeros out empty prompts. However, enabling this option and placing the entire pipeline under autocast can lead to numerical overflows with the T5 text encoder.
-
-When enabling `force_zeros_for_empty_prompt`, it is recommended to run the text encoding step outside the autocast context in full precision.
-</Tip>
-
-<Tip>
-Decoding the latents in full precision is very memory intensive. You will need at least 70GB VRAM to generate the 163 frames in this example. To reduce memory, either reduce the number of frames or run the decoding step in `torch.bfloat16`.
-</Tip>
-
-```python
-import torch
-from torch.nn.attention import SDPBackend, sdpa_kernel
-
-from diffusers import MochiPipeline
-from diffusers.utils import export_to_video
-from diffusers.video_processor import VideoProcessor
-
-pipe = MochiPipeline.from_pretrained("genmo/mochi-1-preview", force_zeros_for_empty_prompt=True)
-pipe.enable_vae_tiling()
-pipe.enable_model_cpu_offload()
-
-prompt =  "An aerial shot of a parade of elephants walking across the African savannah. The camera showcases the herd and the surrounding landscape."
-
-with torch.no_grad():
-    prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_prompt_attention_mask = (
-        pipe.encode_prompt(prompt=prompt)
-    )
-
-with torch.autocast("cuda", torch.bfloat16):
-    with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
-        frames = pipe(
-            prompt_embeds=prompt_embeds,
-            prompt_attention_mask=prompt_attention_mask,
-            negative_prompt_embeds=negative_prompt_embeds,
-            negative_prompt_attention_mask=negative_prompt_attention_mask,
-            guidance_scale=4.5,
-            num_inference_steps=64,
-            height=480,
-            width=848,
-            num_frames=163,
-            generator=torch.Generator("cuda").manual_seed(0),
-            output_type="latent",
-            return_dict=False,
-        )[0]
-
-video_processor = VideoProcessor(vae_scale_factor=8)
-has_latents_mean = hasattr(pipe.vae.config, "latents_mean") and pipe.vae.config.latents_mean is not None
-has_latents_std = hasattr(pipe.vae.config, "latents_std") and pipe.vae.config.latents_std is not None
-if has_latents_mean and has_latents_std:
-    latents_mean = (
-        torch.tensor(pipe.vae.config.latents_mean).view(1, 12, 1, 1, 1).to(frames.device, frames.dtype)
-    )
-    latents_std = (
-        torch.tensor(pipe.vae.config.latents_std).view(1, 12, 1, 1, 1).to(frames.device, frames.dtype)
-    )
-    frames = frames * latents_std / pipe.vae.config.scaling_factor + latents_mean
-else:
-    frames = frames / pipe.vae.config.scaling_factor
-
-with torch.no_grad():
-    video = pipe.vae.decode(frames.to(pipe.vae.dtype), return_dict=False)[0]
-
-video = video_processor.postprocess_video(video)[0]
-export_to_video(video, "mochi.mp4", fps=30)
-```
-
-## Running inference with multiple GPUs
-
-It is possible to split the large Mochi transformer across multiple GPUs using the `device_map` and `max_memory` options in `from_pretrained`. In the following example we split the model across two GPUs, each with 24GB of VRAM.
-
-```python
-import torch
-from diffusers import MochiPipeline, MochiTransformer3DModel
-from diffusers.utils import export_to_video
-
-model_id = "genmo/mochi-1-preview"
-transformer = MochiTransformer3DModel.from_pretrained(
-    model_id,
-    subfolder="transformer",
-    device_map="auto",
-    max_memory={0: "24GB", 1: "24GB"}
-)
-
-pipe = MochiPipeline.from_pretrained(model_id,  transformer=transformer)
-pipe.enable_model_cpu_offload()
-pipe.enable_vae_tiling()
-
-with torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=False):
-    frames = pipe(
-        prompt="Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k.",
-        negative_prompt="",
-        height=480,
-        width=848,
-        num_frames=85,
-        num_inference_steps=50,
-        guidance_scale=4.5,
-        num_videos_per_prompt=1,
-        generator=torch.Generator(device="cuda").manual_seed(0),
-        max_sequence_length=256,
-        output_type="pil",
-    ).frames[0]
-
-export_to_video(frames, "output.mp4", fps=30)
-```
-
-## Using single file loading with the Mochi Transformer
-
-You can use `from_single_file` to load the Mochi transformer in its original format.
-
-<Tip>
-Diffusers currently doesn't support using the FP8 scaled versions of the Mochi single file checkpoints.
-</Tip>
-
-```python
-import torch
-from diffusers import MochiPipeline, MochiTransformer3DModel
-from diffusers.utils import export_to_video
-
-model_id = "genmo/mochi-1-preview"
-
-ckpt_path = "https://huggingface.co/Comfy-Org/mochi_preview_repackaged/blob/main/split_files/diffusion_models/mochi_preview_bf16.safetensors"
-
-transformer = MochiTransformer3DModel.from_single_file(ckpt_path, torch_dtype=torch.bfloat16)
-
-pipe = MochiPipeline.from_pretrained(model_id,  transformer=transformer)
-pipe.enable_model_cpu_offload()
-pipe.enable_vae_tiling()
-
-with torch.autocast(device_type="cuda", dtype=torch.bfloat16, cache_enabled=False):
-    frames = pipe(
-        prompt="Close-up of a chameleon's eye, with its scaly skin changing color. Ultra high resolution 4k.",
-        negative_prompt="",
-        height=480,
-        width=848,
-        num_frames=85,
-        num_inference_steps=50,
-        guidance_scale=4.5,
-        num_videos_per_prompt=1,
-        generator=torch.Generator(device="cuda").manual_seed(0),
-        max_sequence_length=256,
-        output_type="pil",
-    ).frames[0]
-
-export_to_video(frames, "output.mp4", fps=30)
-```
-
 ## MochiPipeline

 [[autodoc]] MochiPipeline
@@ -48,11 +48,6 @@ Since RegEx is supported as a way for matching layer identifiers, it is crucial
  - all
  - __call__

-## StableDiffusionPAGInpaintPipeline
-[[autodoc]] StableDiffusionPAGInpaintPipeline
-	- all
-	- __call__
-
 ## StableDiffusionPAGPipeline
 [[autodoc]] StableDiffusionPAGPipeline
 	- all
@@ -1,67 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. -->
-
-# SanaPipeline
-
-[SANA: Efficient High-Resolution Image Synthesis with Linear Diffusion Transformers](https://huggingface.co/papers/2410.10629) from NVIDIA and MIT HAN Lab, by Enze Xie, Junsong Chen, Junyu Chen, Han Cai, Haotian Tang, Yujun Lin, Zhekai Zhang, Muyang Li, Ligeng Zhu, Yao Lu, Song Han.
-
-The abstract from the paper is:
-
-*We introduce Sana, a text-to-image framework that can efficiently generate images up to 4096×4096 resolution. Sana can synthesize high-resolution, high-quality images with strong text-image alignment at a remarkably fast speed, deployable on laptop GPU. Core designs include: (1) Deep compression autoencoder: unlike traditional AEs, which compress images only 8×, we trained an AE that can compress images 32×, effectively reducing the number of latent tokens. (2) Linear DiT: we replace all vanilla attention in DiT with linear attention, which is more efficient at high resolutions without sacrificing quality. (3) Decoder-only text encoder: we replaced T5 with modern decoder-only small LLM as the text encoder and designed complex human instruction with in-context learning to enhance the image-text alignment. (4) Efficient training and sampling: we propose Flow-DPM-Solver to reduce sampling steps, with efficient caption labeling and selection to accelerate convergence. As a result, Sana-0.6B is very competitive with modern giant diffusion model (e.g. Flux-12B), being 20 times smaller and 100+ times faster in measured throughput. Moreover, Sana-0.6B can be deployed on a 16GB laptop GPU, taking less than 1 second to generate a 1024×1024 resolution image. Sana enables content creation at low cost. Code and model will be publicly released.*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-This pipeline was contributed by [lawrence-cj](https://github.com/lawrence-cj) and [chenjy2003](https://github.com/chenjy2003). The original codebase can be found [here](https://github.com/NVlabs/Sana). The original weights can be found under [hf.co/Efficient-Large-Model](https://huggingface.co/Efficient-Large-Model).
-
-Available models:
-
-| Model | Recommended dtype |
-|:-----:|:-----------------:|
-| [`Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers) | `torch.bfloat16` |
-| [`Efficient-Large-Model/Sana_1600M_1024px_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_diffusers) | `torch.float16` |
-| [`Efficient-Large-Model/Sana_1600M_1024px_MultiLing_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_1024px_MultiLing_diffusers) | `torch.float16` |
-| [`Efficient-Large-Model/Sana_1600M_512px_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_512px_diffusers) | `torch.float16` |
-| [`Efficient-Large-Model/Sana_1600M_512px_MultiLing_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_1600M_512px_MultiLing_diffusers) | `torch.float16` |
-| [`Efficient-Large-Model/Sana_600M_1024px_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_600M_1024px_diffusers) | `torch.float16` |
-| [`Efficient-Large-Model/Sana_600M_512px_diffusers`](https://huggingface.co/Efficient-Large-Model/Sana_600M_512px_diffusers) | `torch.float16` |
-
-Refer to [this](https://huggingface.co/collections/Efficient-Large-Model/sana-673efba2a57ed99843f11f9e) collection for more information.
-
-Note: The recommended dtype mentioned is for the transformer weights. The text encoder and VAE weights must stay in `torch.bfloat16` or `torch.float32` for the model to work correctly. Please refer to the inference example below to see how to load the model with the recommended dtype. 
-
-<Tip>
-
-Make sure to pass the `variant` argument for downloaded checkpoints to use lower disk space. Set it to `"fp16"` for models with recommended dtype as `torch.float16`, and `"bf16"` for models with recommended dtype as `torch.bfloat16`. By default, `torch.float32` weights are downloaded, which use twice the amount of disk storage. Additionally, `torch.float32` weights can be downcasted on-the-fly by specifying the `torch_dtype` argument. Read about it in the [docs](https://huggingface.co/docs/diffusers/v0.31.0/en/api/pipelines/overview#diffusers.DiffusionPipeline.from_pretrained).
-
-</Tip>
-
-## SanaPipeline
-
-[[autodoc]] SanaPipeline
-  - all
-  - __call__
-
-## SanaPAGPipeline
-
-[[autodoc]] SanaPAGPipeline
-  - all
-  - __call__
-
-## SanaPipelineOutput
-
-[[autodoc]] pipelines.sana.pipeline_output.SanaPipelineOutput
@@ -59,76 +59,9 @@ image.save("sd3_hello_world.png")
 - [`stabilityai/stable-diffusion-3.5-large`](https://huggingface.co/stabilityai/stable-diffusion-3-5-large)
 - [`stabilityai/stable-diffusion-3.5-large-turbo`](https://huggingface.co/stabilityai/stable-diffusion-3-5-large-turbo)

-## Image Prompting with IP-Adapters
-
-An IP-Adapter lets you prompt SD3 with images, in addition to the text prompt. This is especially useful when describing complex concepts that are difficult to articulate through text alone and you have reference images. To load and use an IP-Adapter, you need:
-
- `image_encoder`: Pre-trained vision model used to obtain image features, usually a CLIP image encoder.
- `feature_extractor`: Image processor that prepares the input image for the chosen `image_encoder`.
- `ip_adapter_id`: Checkpoint containing parameters of image cross attention layers and image projection. 
-
-IP-Adapters are trained for a specific model architecture, so they also work in finetuned variations of the base model. You can use the [`~SD3IPAdapterMixin.set_ip_adapter_scale`] function to adjust how strongly the output aligns with the image prompt. The higher the value, the more closely the model follows the image prompt. A default value of 0.5 is typically a good balance, ensuring the model considers both the text and image prompts equally.
-
-```python
-import torch
-from PIL import Image
-
-from diffusers import StableDiffusion3Pipeline
-from transformers import SiglipVisionModel, SiglipImageProcessor
-
-image_encoder_id = "google/siglip-so400m-patch14-384"
-ip_adapter_id = "InstantX/SD3.5-Large-IP-Adapter"
-
-feature_extractor = SiglipImageProcessor.from_pretrained(
-    image_encoder_id,
-    torch_dtype=torch.float16
-)
-image_encoder = SiglipVisionModel.from_pretrained(
-    image_encoder_id,
-    torch_dtype=torch.float16
-).to( "cuda")
-
-pipe = StableDiffusion3Pipeline.from_pretrained(
-    "stabilityai/stable-diffusion-3.5-large",
-    torch_dtype=torch.float16,
-    feature_extractor=feature_extractor,
-    image_encoder=image_encoder,
-).to("cuda")
-
-pipe.load_ip_adapter(ip_adapter_id)
-pipe.set_ip_adapter_scale(0.6)
-
-ref_img = Image.open("image.jpg").convert('RGB')
-
-image = pipe(
-    width=1024,
-    height=1024,
-    prompt="a cat",
-    negative_prompt="lowres, low quality, worst quality",
-    num_inference_steps=24,
-    guidance_scale=5.0,
-    ip_adapter_image=ref_img
-).images[0]
-
-image.save("result.jpg")
-```
-
-<div class="justify-center">
-    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sd3_ip_adapter_example.png"/>
-    <figcaption class="mt-2 text-sm text-center text-gray-500">IP-Adapter examples with prompt "a cat"</figcaption>
-</div>
-
-
-<Tip>
-
-Check out [IP-Adapter](../../../using-diffusers/ip_adapter) to learn more about how IP-Adapters work.
-
-</Tip>
-
-
 ## Memory Optimisations for SD3

-SD3 uses three text encoders, one of which is the very large T5-XXL model. This makes it challenging to run the model on GPUs with less than 24GB of VRAM, even when using `fp16` precision. The following section outlines a few memory optimizations in Diffusers that make it easier to run SD3 on low resource hardware.
+SD3 uses three text encoders, one of which is the very large T5-XXL model. This makes it challenging to run the model on GPUs with less than 24GB of VRAM, even when using `fp16` precision. The following section outlines a few memory optimizations in Diffusers that make it easier to run SD3 on low resource hardware.

 ### Running Inference with Model Offloading

@@ -28,13 +28,6 @@ Learn how to quantize models in the [Quantization](../quantization/overview) gui

 [[autodoc]] BitsAndBytesConfig

-## GGUFQuantizationConfig
-
-[[autodoc]] GGUFQuantizationConfig
-## TorchAoConfig
-
-[[autodoc]] TorchAoConfig
-
 ## DiffusersQuantizer

 [[autodoc]] quantizers.base.DiffusersQuantizer
@@ -17,12 +17,6 @@ specific language governing permissions and limitations under the License.

 4-bit quantization compresses a model even further, and it is commonly used with [QLoRA](https://hf.co/papers/2305.14314) to finetune quantized LLMs.

-This guide demonstrates how quantization can enable running
-[FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev)
-on less than 16GB of VRAM and even on a free Google
-Colab instance.
-
-![comparison image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/quant-bnb/comparison.png)

 To use bitsandbytes, make sure you have the following libraries installed:

@@ -37,167 +31,70 @@ Now you can quantize a model by passing a [`BitsAndBytesConfig`] to [`~ModelMixi

 Quantizing a model in 8-bit halves the memory-usage:

-bitsandbytes is supported in both Transformers and Diffusers, so you can quantize both the
-[`FluxTransformer2DModel`] and [`~transformers.T5EncoderModel`].
+```py
+from diffusers import FluxTransformer2DModel, BitsAndBytesConfig

-For Ada and higher-series GPUs, we recommend changing `torch_dtype` to `torch.bfloat16`.
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)

-> [!TIP]
-> The [`CLIPTextModel`] and [`AutoencoderKL`] aren't quantized because they're already small in size and because [`AutoencoderKL`] only has a few `torch.nn.Linear` layers.
+model_8bit = FluxTransformer2DModel.from_pretrained(
+    "black-forest-labs/FLUX.1-dev", 
+    subfolder="transformer",
+    quantization_config=quantization_config
+)
+```
+
+By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:

 ```py
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
-from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
+from diffusers import FluxTransformer2DModel, BitsAndBytesConfig

-from diffusers import FluxTransformer2DModel
-from transformers import T5EncoderModel
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)

-quant_config = TransformersBitsAndBytesConfig(load_in_8bit=True,)
-
-text_encoder_2_8bit = T5EncoderModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    subfolder="text_encoder_2",
-    quantization_config=quant_config,
-    torch_dtype=torch.float16,
-)
-
-quant_config = DiffusersBitsAndBytesConfig(load_in_8bit=True,)
-
-transformer_8bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
+model_8bit = FluxTransformer2DModel.from_pretrained(
+    "black-forest-labs/FLUX.1-dev", 
    subfolder="transformer",
-    quantization_config=quant_config,
-    torch_dtype=torch.float16,
+    quantization_config=quantization_config,
+    torch_dtype=torch.float32
 )
+model_8bit.transformer_blocks.layers[-1].norm2.weight.dtype
 ```

-By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter.
-
-```diff
-transformer_8bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    subfolder="transformer",
-    quantization_config=quant_config,
-+   torch_dtype=torch.float32,
-)
-```
-
-Let's generate an image using our quantized models.
-
-Setting `device_map="auto"` automatically fills all available space on the GPU(s) first, then the
-CPU, and finally, the hard drive (the absolute slowest option) if there is still not enough memory.
-
-```py
-pipe = FluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    transformer=transformer_8bit,
-    text_encoder_2=text_encoder_2_8bit,
-    torch_dtype=torch.float16,
-    device_map="auto",
-)
-
-pipe_kwargs = {
-    "prompt": "A cat holding a sign that says hello world",
-    "height": 1024,
-    "width": 1024,
-    "guidance_scale": 3.5,
-    "num_inference_steps": 50,
-    "max_sequence_length": 512,
-}
-
-image = pipe(**pipe_kwargs, generator=torch.manual_seed(0),).images[0]
-```
-
-<div class="flex justify-center">
-   <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/quant-bnb/8bit.png"/>
-</div>
-
-When there is enough memory, you can also directly move the pipeline to the GPU with `.to("cuda")` and apply [`~DiffusionPipeline.enable_model_cpu_offload`] to optimize GPU memory usage.
-
-Once a model is quantized, you can push the model to the Hub with the [`~ModelMixin.push_to_hub`] method. The quantization `config.json` file is pushed first, followed by the quantized model weights. You can also save the serialized 8-bit models locally with [`~ModelMixin.save_pretrained`].
+Once a model is quantized, you can push the model to the Hub with the [`~ModelMixin.push_to_hub`] method. The quantization `config.json` file is pushed first, followed by the quantized model weights. You can also save the serialized 8-bit models locally with [`~ModelMixin.save_pretrained`].

 </hfoption>
 <hfoption id="4-bit">

 Quantizing a model in 4-bit reduces your memory-usage by 4x:

-bitsandbytes is supported in both Transformers and Diffusers, so you can quantize both the
-[`FluxTransformer2DModel`] and [`~transformers.T5EncoderModel`].
+```py
+from diffusers import FluxTransformer2DModel, BitsAndBytesConfig

-For Ada and higher-series GPUs, we recommend changing `torch_dtype` to `torch.bfloat16`.
+quantization_config = BitsAndBytesConfig(load_in_4bit=True)

-> [!TIP]
-> The [`CLIPTextModel`] and [`AutoencoderKL`] aren't quantized because they're already small in size and because [`AutoencoderKL`] only has a few `torch.nn.Linear` layers.
+model_4bit = FluxTransformer2DModel.from_pretrained(
+    "black-forest-labs/FLUX.1-dev", 
+    subfolder="transformer",
+    quantization_config=quantization_config
+)
+```
+
+By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:

 ```py
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
-from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
+from diffusers import FluxTransformer2DModel, BitsAndBytesConfig

-from diffusers import FluxTransformer2DModel
-from transformers import T5EncoderModel
+quantization_config = BitsAndBytesConfig(load_in_4bit=True)

-quant_config = TransformersBitsAndBytesConfig(load_in_4bit=True,)
-
-text_encoder_2_4bit = T5EncoderModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    subfolder="text_encoder_2",
-    quantization_config=quant_config,
-    torch_dtype=torch.float16,
-)
-
-quant_config = DiffusersBitsAndBytesConfig(load_in_4bit=True,)
-
-transformer_4bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
+model_4bit = FluxTransformer2DModel.from_pretrained(
+    "black-forest-labs/FLUX.1-dev", 
    subfolder="transformer",
-    quantization_config=quant_config,
-    torch_dtype=torch.float16,
+    quantization_config=quantization_config,
+    torch_dtype=torch.float32
 )
+model_4bit.transformer_blocks.layers[-1].norm2.weight.dtype
 ```

-By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter.
-
-```diff
-transformer_4bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    subfolder="transformer",
-    quantization_config=quant_config,
-+   torch_dtype=torch.float32,
-)
-```
-
-Let's generate an image using our quantized models.
-
-Setting `device_map="auto"` automatically fills all available space on the GPU(s) first, then the CPU, and finally, the hard drive (the absolute slowest option) if there is still not enough memory.
-
-```py
-pipe = FluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    transformer=transformer_4bit,
-    text_encoder_2=text_encoder_2_4bit,
-    torch_dtype=torch.float16,
-    device_map="auto",
-)
-
-pipe_kwargs = {
-    "prompt": "A cat holding a sign that says hello world",
-    "height": 1024,
-    "width": 1024,
-    "guidance_scale": 3.5,
-    "num_inference_steps": 50,
-    "max_sequence_length": 512,
-}
-
-image = pipe(**pipe_kwargs, generator=torch.manual_seed(0),).images[0]
-```
-
-<div class="flex justify-center">
-   <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/quant-bnb/4bit.png"/>
-</div>
-
-When there is enough memory, you can also directly move the pipeline to the GPU with `.to("cuda")` and apply [`~DiffusionPipeline.enable_model_cpu_offload`] to optimize GPU memory usage.
-
-Once a model is quantized, you can push the model to the Hub with the [`~ModelMixin.push_to_hub`] method. The quantization `config.json` file is pushed first, followed by the quantized model weights. You can also save the serialized 4-bit models locally with [`~ModelMixin.save_pretrained`].
+Call [`~ModelMixin.push_to_hub`] after loading it in 4-bit precision. You can also save the serialized 4-bit models locally with [`~ModelMixin.save_pretrained`].  

 </hfoption>
 </hfoptions>
@@ -302,34 +199,17 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dty
 NF4 is a 4-bit data type from the [QLoRA](https://hf.co/papers/2305.14314) paper, adapted for weights initialized from a normal distribution. You should use NF4 for training 4-bit base models. This can be configured with the `bnb_4bit_quant_type` parameter in the [`BitsAndBytesConfig`]:

 ```py
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
-from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
+from diffusers import BitsAndBytesConfig

-from diffusers import FluxTransformer2DModel
-from transformers import T5EncoderModel
-
-quant_config = TransformersBitsAndBytesConfig(
+nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
 )

-text_encoder_2_4bit = T5EncoderModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    subfolder="text_encoder_2",
-    quantization_config=quant_config,
-    torch_dtype=torch.float16,
-)
-
-quant_config = DiffusersBitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-)
-
-transformer_4bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
+model_nf4 = SD3Transformer2DModel.from_pretrained(
+    "stabilityai/stable-diffusion-3-medium-diffusers",
    subfolder="transformer",
-    quantization_config=quant_config,
-    torch_dtype=torch.float16,
+    quantization_config=nf4_config,
 )
 ```

@@ -340,74 +220,38 @@ For inference, the `bnb_4bit_quant_type` does not have a huge impact on performa
 Nested quantization is a technique that can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an additional 0.4 bits/parameter. 

 ```py
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
-from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
+from diffusers import BitsAndBytesConfig

-from diffusers import FluxTransformer2DModel
-from transformers import T5EncoderModel
-
-quant_config = TransformersBitsAndBytesConfig(
+double_quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
 )

-text_encoder_2_4bit = T5EncoderModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    subfolder="text_encoder_2",
-    quantization_config=quant_config,
-    torch_dtype=torch.float16,
-)
-
-quant_config = DiffusersBitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-)
-
-transformer_4bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
+double_quant_model = SD3Transformer2DModel.from_pretrained(
+    "stabilityai/stable-diffusion-3-medium-diffusers",
    subfolder="transformer",
-    quantization_config=quant_config,
-    torch_dtype=torch.float16,
+    quantization_config=double_quant_config,
 )
 ```

 ## Dequantizing `bitsandbytes` models

-Once quantized, you can dequantize a model to its original precision, but this might result in a small loss of quality. Make sure you have enough GPU RAM to fit the dequantized model. 
+Once quantized, you can dequantize a model to its original precision, but this might result in a small loss of quality. Make sure you have enough GPU RAM to fit the dequantized model.

 ```python
-from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
-from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
+from diffusers import BitsAndBytesConfig

-from diffusers import FluxTransformer2DModel
-from transformers import T5EncoderModel
-
-quant_config = TransformersBitsAndBytesConfig(
+double_quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
 )

-text_encoder_2_4bit = T5EncoderModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    subfolder="text_encoder_2",
-    quantization_config=quant_config,
-    torch_dtype=torch.float16,
-)
-
-quant_config = DiffusersBitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-)
-
-transformer_4bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
+double_quant_model = SD3Transformer2DModel.from_pretrained(
+    "stabilityai/stable-diffusion-3-medium-diffusers",
    subfolder="transformer",
-    quantization_config=quant_config,
-    torch_dtype=torch.float16,
+    quantization_config=double_quant_config,
 )
-
-text_encoder_2_4bit.dequantize()
-transformer_4bit.dequantize()
+double_quant_model.dequantize()
 ```

 ## Resources
@@ -1,69 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-->
-
-# GGUF
-
-The GGUF file format is typically used to store models for inference with [GGML](https://github.com/ggerganov/ggml) and supports a variety of block wise quantization options. Diffusers supports loading checkpoints prequantized and saved in the GGUF format via `from_single_file` loading with Model classes. Loading GGUF checkpoints via Pipelines is currently not supported.
-
-The following example will load the [FLUX.1 DEV](https://huggingface.co/black-forest-labs/FLUX.1-dev) transformer model using the GGUF Q2_K quantization variant.
-
-Before starting please install gguf in your environment
-
-```shell
-pip install -U gguf
-```
-
-Since GGUF is a single file format, use [`~FromSingleFileMixin.from_single_file`] to load the model and pass in the [`GGUFQuantizationConfig`].
-
-When using GGUF checkpoints, the quantized weights remain in a low memory `dtype`(typically `torch.uint8`) and are dynamically dequantized and cast to the configured `compute_dtype` during each module's forward pass through the model. The `GGUFQuantizationConfig` allows you to set the `compute_dtype`.
-
-The functions used for dynamic dequantizatation are based on the great work done by [city96](https://github.com/city96/ComfyUI-GGUF), who created the Pytorch ports of the original [`numpy`](https://github.com/ggerganov/llama.cpp/blob/master/gguf-py/gguf/quants.py) implementation by [compilade](https://github.com/compilade).
-
-```python
-import torch
-
-from diffusers import FluxPipeline, FluxTransformer2DModel, GGUFQuantizationConfig
-
-ckpt_path = (
-    "https://huggingface.co/city96/FLUX.1-dev-gguf/blob/main/flux1-dev-Q2_K.gguf"
-)
-transformer = FluxTransformer2DModel.from_single_file(
-    ckpt_path,
-    quantization_config=GGUFQuantizationConfig(compute_dtype=torch.bfloat16),
-    torch_dtype=torch.bfloat16,
-)
-pipe = FluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    transformer=transformer,
-    torch_dtype=torch.bfloat16,
-)
-pipe.enable_model_cpu_offload()
-prompt = "A cat holding a sign that says hello world"
-image = pipe(prompt, generator=torch.manual_seed(0)).images[0]
-image.save("flux-gguf.png")
-```
-
-## Supported Quantization Types
-
- BF16
- Q4_0
- Q4_1
- Q5_0
- Q5_1
- Q8_0
- Q2_K
- Q3_K
- Q4_K
- Q5_K
- Q6_K
-
@@ -17,7 +17,7 @@ Quantization techniques focus on representing data with less information while a

 <Tip>

-Interested in adding a new quantization method to Diffusers? Refer to the [Contribute new quantization method guide](https://huggingface.co/docs/transformers/main/en/quantization/contribute) to learn more about adding a new quantization method.
+Interested in adding a new quantization method to Transformers? Refer to the [Contribute new quantization method guide](https://huggingface.co/docs/transformers/main/en/quantization/contribute) to learn more about adding a new quantization method.

 </Tip>

@@ -32,9 +32,4 @@ If you are new to the quantization field, we recommend you to check out these be

 ## When to use what?

-Diffusers currently supports the following quantization methods.
- [BitsandBytes](./bitsandbytes)
- [TorchAO](./torchao)
- [GGUF](./gguf)
-
-[This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques.
+This section will be expanded once Diffusers has multiple quantization backends. Currently, we only support `bitsandbytes`. [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques. 
@@ -1,156 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# torchao
-
-[TorchAO](https://github.com/pytorch/ao) is an architecture optimization library for PyTorch. It provides high-performance dtypes, optimization techniques, and kernels for inference and training, featuring composability with native PyTorch features like [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html), FullyShardedDataParallel (FSDP), and more.
-
-Before you begin, make sure you have Pytorch 2.5+ and TorchAO installed.
-
-```bash
-pip install -U torch torchao
-```
-
-
-Quantize a model by passing [`TorchAoConfig`] to [`~ModelMixin.from_pretrained`] (you can also load pre-quantized models). This works for any model in any modality, as long as it supports loading with [Accelerate](https://hf.co/docs/accelerate/index) and contains `torch.nn.Linear` layers.
-
-The example below only quantizes the weights to int8.
-
-```python
-import torch
-from diffusers import FluxPipeline, FluxTransformer2DModel, TorchAoConfig
-
-model_id = "black-forest-labs/FLUX.1-dev"
-dtype = torch.bfloat16
-
-quantization_config = TorchAoConfig("int8wo")
-transformer = FluxTransformer2DModel.from_pretrained(
-    model_id,
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=dtype,
-)
-pipe = FluxPipeline.from_pretrained(
-    model_id,
-    transformer=transformer,
-    torch_dtype=dtype,
-)
-pipe.to("cuda")
-
-# Without quantization: ~31.447 GB
-# With quantization: ~20.40 GB
-print(f"Pipeline memory usage: {torch.cuda.max_memory_reserved() / 1024**3:.3f} GB")
-
-prompt = "A cat holding a sign that says hello world"
-image = pipe(
-    prompt, num_inference_steps=50, guidance_scale=4.5, max_sequence_length=512
-).images[0]
-image.save("output.png")
-```
-
-TorchAO is fully compatible with [torch.compile](./optimization/torch2.0#torchcompile), setting it apart from other quantization methods. This makes it easy to speed up inference with just one line of code.
-
-```python
-# In the above code, add the following after initializing the transformer
-transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True)
-```
-
-For speed and memory benchmarks on Flux and CogVideoX, please refer to the table [here](https://github.com/huggingface/diffusers/pull/10009#issue-2688781450). You can also find some torchao [benchmarks](https://github.com/pytorch/ao/tree/main/torchao/quantization#benchmarks) numbers for various hardware.
-
-torchao also supports an automatic quantization API through [autoquant](https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md#autoquantization). Autoquantization determines the best quantization strategy applicable to a model by comparing the performance of each technique on chosen input types and shapes. Currently, this can be used directly on the underlying modeling components. Diffusers will also expose an autoquant configuration option in the future.
-
-The `TorchAoConfig` class accepts three parameters:
- `quant_type`: A string value mentioning one of the quantization types below.
- `modules_to_not_convert`: A list of module full/partial module names for which quantization should not be performed. For example, to not perform any quantization of the [`FluxTransformer2DModel`]'s first block, one would specify: `modules_to_not_convert=["single_transformer_blocks.0"]`.
- `kwargs`: A dict of keyword arguments to pass to the underlying quantization method which will be invoked based on `quant_type`.
-
-## Supported quantization types
-
-torchao supports weight-only quantization and weight and dynamic-activation quantization for int8, float3-float8, and uint1-uint7.
-
-Weight-only quantization stores the model weights in a specific low-bit data type but performs computation with a higher-precision data type, like `bfloat16`. This lowers the memory requirements from model weights but retains the memory peaks for activation computation.
-
-Dynamic activation quantization stores the model weights in a low-bit dtype, while also quantizing the activations on-the-fly to save additional memory. This lowers the memory requirements from model weights, while also lowering the memory overhead from activation computations. However, this may come at a quality tradeoff at times, so it is recommended to test different models thoroughly.
-
-The quantization methods supported are as follows:
-
-| **Category** | **Full Function Names** | **Shorthands** |
-|--------------|-------------------------|----------------|
-| **Integer quantization** | `int4_weight_only`, `int8_dynamic_activation_int4_weight`, `int8_weight_only`, `int8_dynamic_activation_int8_weight` | `int4wo`, `int4dq`, `int8wo`, `int8dq` |
-| **Floating point 8-bit quantization** | `float8_weight_only`, `float8_dynamic_activation_float8_weight`, `float8_static_activation_float8_weight` | `float8wo`, `float8wo_e5m2`, `float8wo_e4m3`, `float8dq`, `float8dq_e4m3`, `float8_e4m3_tensor`, `float8_e4m3_row` |
-| **Floating point X-bit quantization** | `fpx_weight_only` | `fpX_eAwB` where `X` is the number of bits (1-7), `A` is exponent bits, and `B` is mantissa bits. Constraint: `X == A + B + 1` |
-| **Unsigned Integer quantization** | `uintx_weight_only` | `uint1wo`, `uint2wo`, `uint3wo`, `uint4wo`, `uint5wo`, `uint6wo`, `uint7wo` |
-
-Some quantization methods are aliases (for example, `int8wo` is the commonly used shorthand for `int8_weight_only`). This allows using the quantization methods described in the torchao docs as-is, while also making it convenient to remember their shorthand notations.
-
-Refer to the official torchao documentation for a better understanding of the available quantization methods and the exhaustive list of configuration options available.
-
-## Serializing and Deserializing quantized models
-
-To serialize a quantized model in a given dtype, first load the model with the desired quantization dtype and then save it using the [`~ModelMixin.save_pretrained`] method.
-
-```python
-import torch
-from diffusers import FluxTransformer2DModel, TorchAoConfig
-
-quantization_config = TorchAoConfig("int8wo")
-transformer = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/Flux.1-Dev",
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=torch.bfloat16,
-)
-transformer.save_pretrained("/path/to/flux_int8wo", safe_serialization=False)
-```
-
-To load a serialized quantized model, use the [`~ModelMixin.from_pretrained`] method.
-
-```python
-import torch
-from diffusers import FluxPipeline, FluxTransformer2DModel
-
-transformer = FluxTransformer2DModel.from_pretrained("/path/to/flux_int8wo", torch_dtype=torch.bfloat16, use_safetensors=False)
-pipe = FluxPipeline.from_pretrained("black-forest-labs/Flux.1-Dev", transformer=transformer, torch_dtype=torch.bfloat16)
-pipe.to("cuda")
-
-prompt = "A cat holding a sign that says hello world"
-image = pipe(prompt, num_inference_steps=30, guidance_scale=7.0).images[0]
-image.save("output.png")
-```
-
-Some quantization methods, such as `uint4wo`, cannot be loaded directly and may result in an `UnpicklingError` when trying to load the models, but work as expected when saving them. In order to work around this, one can load the state dict manually into the model. Note, however, that this requires using `weights_only=False` in `torch.load`, so it should be run only if the weights were obtained from a trustable source.
-
-```python
-import torch
-from accelerate import init_empty_weights
-from diffusers import FluxPipeline, FluxTransformer2DModel, TorchAoConfig
-
-# Serialize the model
-transformer = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/Flux.1-Dev",
-    subfolder="transformer",
-    quantization_config=TorchAoConfig("uint4wo"),
-    torch_dtype=torch.bfloat16,
-)
-transformer.save_pretrained("/path/to/flux_uint4wo", safe_serialization=False, max_shard_size="50GB")
-# ...
-
-# Load the model
-state_dict = torch.load("/path/to/flux_uint4wo/diffusion_pytorch_model.bin", weights_only=False, map_location="cpu")
-with init_empty_weights():
-    transformer = FluxTransformer2DModel.from_config("/path/to/flux_uint4wo/config.json")
-transformer.load_state_dict(state_dict, strict=True, assign=True)
-```
-
-## Resources
-
- [TorchAO Quantization API](https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md)
- [Diffusers-TorchAO examples](https://github.com/sayakpaul/diffusers-torchao)
@@ -56,7 +56,7 @@ image

 With the `adapter_name` parameter, it is really easy to use another adapter for inference! Load the [nerijs/pixel-art-xl](https://huggingface.co/nerijs/pixel-art-xl) adapter that has been fine-tuned to generate pixel art images and call it `"pixel"`.

-The pipeline automatically sets the first loaded adapter (`"toy"`) as the active adapter, but you can activate the `"pixel"` adapter with the [`~PeftAdapterMixin.set_adapters`] method:
+The pipeline automatically sets the first loaded adapter (`"toy"`) as the active adapter, but you can activate the `"pixel"` adapter with the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method:

 ```python
 pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
@@ -85,7 +85,7 @@ By default, if the most up-to-date versions of PEFT and Transformers are detecte

 You can also merge different adapter checkpoints for inference to blend their styles together.

-Once again, use the [`~PeftAdapterMixin.set_adapters`] method to activate the `pixel` and `toy` adapters and specify the weights for how they should be merged.
+Once again, use the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method to activate the `pixel` and `toy` adapters and specify the weights for how they should be merged.

 ```python
 pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.5, 1.0])
@@ -114,7 +114,7 @@ Impressive! As you can see, the model generated an image that mixed the characte
 > [!TIP]
 > Through its PEFT integration, Diffusers also offers more efficient merging methods which you can learn about in the [Merge LoRAs](../using-diffusers/merge_loras) guide!

-To return to only using one adapter, use the [`~PeftAdapterMixin.set_adapters`] method to activate the `"toy"` adapter:
+To return to only using one adapter, use the [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`] method to activate the `"toy"` adapter:

 ```python
 pipe.set_adapters("toy")
@@ -127,7 +127,7 @@ image = pipe(
 image
 ```

-Or to disable all adapters entirely, use the [`~PeftAdapterMixin.disable_lora`] method to return the base model.
+Or to disable all adapters entirely, use the [`~diffusers.loaders.UNet2DConditionLoadersMixin.disable_lora`] method to return the base model.

 ```python
 pipe.disable_lora()
@@ -140,8 +140,7 @@ image
 ![no-lora](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_20_1.png)

 ### Customize adapters strength
-
-For even more customization, you can control how strongly the adapter affects each part of the pipeline. For this, pass a dictionary with the control strengths (called "scales") to [`~PeftAdapterMixin.set_adapters`].
+For even more customization, you can control how strongly the adapter affects each part of the pipeline. For this, pass a dictionary with the control strengths (called "scales") to [`~diffusers.loaders.UNet2DConditionLoadersMixin.set_adapters`].

 For example, here's how you can turn on the adapter for the `down` parts, but turn it off for the `mid` and `up` parts:
 ```python
@@ -196,7 +195,7 @@ image

 ![block-lora-mixed](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/peft_integration/diffusers_peft_lora_inference_block_mixed.png)

-## Manage adapters
+## Manage active adapters

 You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.StableDiffusionLoraLoaderMixin.get_active_adapters`] method to check the list of active adapters:

@@ -213,11 +212,3 @@ list_adapters_component_wise = pipe.get_list_adapters()
 list_adapters_component_wise
 {"text_encoder": ["toy", "pixel"], "unet": ["toy", "pixel"], "text_encoder_2": ["toy", "pixel"]}
 ```
-
-The [`~PeftAdapterMixin.delete_adapters`] function completely removes an adapter and their LoRA layers from a model.
-
-```py
-pipe.delete_adapters("toy")
-pipe.get_active_adapters()
-["pixel"]
-```
@@ -134,16 +134,14 @@ The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method loads L
 - the LoRA weights don't have separate identifiers for the UNet and text encoder
 - the LoRA weights have separate identifiers for the UNet and text encoder

-To directly load (and save) a LoRA adapter at the *model-level*, use [`~PeftAdapterMixin.load_lora_adapter`], which builds and prepares the necessary model configuration for the adapter. Like [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`], [`PeftAdapterMixin.load_lora_adapter`] can load LoRAs for both the UNet and text encoder. For example, if you're loading a LoRA for the UNet, [`PeftAdapterMixin.load_lora_adapter`] ignores the keys for the text encoder.
-
-Use the `weight_name` parameter to specify the specific weight file and the `prefix` parameter to filter for the appropriate state dicts (`"unet"` in this case) to load.
+But if you only need to load LoRA weights into the UNet, then you can use the [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`] method. Let's load the [jbilcke-hf/sdxl-cinematic-1](https://huggingface.co/jbilcke-hf/sdxl-cinematic-1) LoRA:

 ```py
 from diffusers import AutoPipelineForText2Image
 import torch

 pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
-pipeline.unet.load_lora_adapter("jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", prefix="unet")
+pipeline.unet.load_attn_procs("jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors")

 # use cnmt in the prompt to trigger the LoRA
 prompt = "A cute cnmt eating a slice of pizza, stunning color scheme, masterpiece, illustration"
@@ -155,8 +153,6 @@ image
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_attn_proc.png" />
 </div>

-Save an adapter with [`~PeftAdapterMixin.save_lora_adapter`].
-
 To unload the LoRA weights, use the [`~loaders.StableDiffusionLoraLoaderMixin.unload_lora_weights`] method to discard the LoRA weights and restore the model to its original weights:

 ```py
@@ -74,7 +74,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -73,7 +73,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -79,7 +79,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -61,7 +61,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -872,9 +872,10 @@ def prepare_rotary_positional_embeddings(
        crops_coords=grid_crops_coords,
        grid_size=(grid_height, grid_width),
        temporal_size=num_frames,
-        device=device,
    )

+    freqs_cos = freqs_cos.to(device=device)
+    freqs_sin = freqs_sin.to(device=device)
    return freqs_cos, freqs_sin


@@ -52,7 +52,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -894,9 +894,10 @@ def prepare_rotary_positional_embeddings(
        crops_coords=grid_crops_coords,
        grid_size=(grid_height, grid_width),
        temporal_size=num_frames,
-        device=device,
    )

+    freqs_cos = freqs_cos.to(device=device)
+    freqs_sin = freqs_sin.to(device=device)
    return freqs_cos, freqs_sin


@@ -241,15 +241,27 @@ from diffusers import StableDiffusionPipeline
 from diffusers.callbacks import PipelineCallback, MultiPipelineCallbacks
 from diffusers.configuration_utils import register_to_config
 import torch
-from typing import Any, Dict, Tuple, Union
+from typing import Any, Dict, Optional


-class SDPromptSchedulingCallback(PipelineCallback):
+pipeline: StableDiffusionPipeline = StableDiffusionPipeline.from_pretrained(
+    "stable-diffusion-v1-5/stable-diffusion-v1-5",
+    torch_dtype=torch.float16,
+    variant="fp16",
+    use_safetensors=True,
+).to("cuda")
+pipeline.safety_checker = None
+pipeline.requires_safety_checker = False
+
+
+class SDPromptScheduleCallback(PipelineCallback):
    @register_to_config
    def __init__(
        self,
-        encoded_prompt: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
-        cutoff_step_ratio=None,
+        prompt: str,
+        negative_prompt: Optional[str] = None,
+        num_images_per_prompt: int = 1,
+        cutoff_step_ratio=1.0,
        cutoff_step_index=None,
    ):
        super().__init__(
@@ -263,10 +275,6 @@ class SDPromptSchedulingCallback(PipelineCallback):
    ) -> Dict[str, Any]:
        cutoff_step_ratio = self.config.cutoff_step_ratio
        cutoff_step_index = self.config.cutoff_step_index
-        if isinstance(self.config.encoded_prompt, tuple):
-            prompt_embeds, negative_prompt_embeds = self.config.encoded_prompt
-        else:
-            prompt_embeds = self.config.encoded_prompt

        # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
        cutoff_step = (
@@ -276,164 +284,34 @@ class SDPromptSchedulingCallback(PipelineCallback):
        )

        if step_index == cutoff_step:
+            prompt_embeds, negative_prompt_embeds = pipeline.encode_prompt(
+                prompt=self.config.prompt,
+                negative_prompt=self.config.negative_prompt,
+                device=pipeline._execution_device,
+                num_images_per_prompt=self.config.num_images_per_prompt,
+                do_classifier_free_guidance=pipeline.do_classifier_free_guidance,
+            )
            if pipeline.do_classifier_free_guidance:
                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
            callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
        return callback_kwargs

-
-pipeline: StableDiffusionPipeline = StableDiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
-    torch_dtype=torch.float16,
-    variant="fp16",
-    use_safetensors=True,
-).to("cuda")
-pipeline.safety_checker = None
-pipeline.requires_safety_checker = False
-
 callback = MultiPipelineCallbacks(
    [
-        SDPromptSchedulingCallback(
-            encoded_prompt=pipeline.encode_prompt(
-                prompt=f"prompt {index}",
-                negative_prompt=f"negative prompt {index}",
-                device=pipeline._execution_device,
-                num_images_per_prompt=1,
-                # pipeline.do_classifier_free_guidance can't be accessed until after pipeline is ran
-                do_classifier_free_guidance=True,
-            ),
-            cutoff_step_index=index,
-        ) for index in range(1, 20)
+        SDPromptScheduleCallback(
+            prompt="Official portrait of a smiling world war ii general, female, cheerful, happy, detailed face, 20th century, highly detailed, cinematic lighting, digital art painting by Greg Rutkowski",
+            negative_prompt="Deformed, ugly, bad anatomy",
+            cutoff_step_ratio=0.25,
+        )
    ]
 )

 image = pipeline(
-    prompt="prompt"
-    negative_prompt="negative prompt",
+    prompt="Official portrait of a smiling world war ii general, male, cheerful, happy, detailed face, 20th century, highly detailed, cinematic lighting, digital art painting by Greg Rutkowski",
+    negative_prompt="Deformed, ugly, bad anatomy",
    callback_on_step_end=callback,
    callback_on_step_end_tensor_inputs=["prompt_embeds"],
 ).images[0]
 torch.cuda.empty_cache()
 image.save('image.png')
 ```
-
-```python
-from diffusers import StableDiffusionXLPipeline
-from diffusers.callbacks import PipelineCallback, MultiPipelineCallbacks
-from diffusers.configuration_utils import register_to_config
-import torch
-from typing import Any, Dict, Tuple, Union
-
-
-class SDXLPromptSchedulingCallback(PipelineCallback):
-    @register_to_config
-    def __init__(
-        self,
-        encoded_prompt: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
-        add_text_embeds: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
-        add_time_ids: Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]],
-        cutoff_step_ratio=None,
-        cutoff_step_index=None,
-    ):
-        super().__init__(
-            cutoff_step_ratio=cutoff_step_ratio, cutoff_step_index=cutoff_step_index
-        )
-
-    tensor_inputs = ["prompt_embeds", "add_text_embeds", "add_time_ids"]
-
-    def callback_fn(
-        self, pipeline, step_index, timestep, callback_kwargs
-    ) -> Dict[str, Any]:
-        cutoff_step_ratio = self.config.cutoff_step_ratio
-        cutoff_step_index = self.config.cutoff_step_index
-        if isinstance(self.config.encoded_prompt, tuple):
-            prompt_embeds, negative_prompt_embeds = self.config.encoded_prompt
-        else:
-            prompt_embeds = self.config.encoded_prompt
-        if isinstance(self.config.add_text_embeds, tuple):
-            add_text_embeds, negative_add_text_embeds = self.config.add_text_embeds
-        else:
-            add_text_embeds = self.config.add_text_embeds
-        if isinstance(self.config.add_time_ids, tuple):
-            add_time_ids, negative_add_time_ids = self.config.add_time_ids
-        else:
-            add_time_ids = self.config.add_time_ids
-
-        # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
-        cutoff_step = (
-            cutoff_step_index
-            if cutoff_step_index is not None
-            else int(pipeline.num_timesteps * cutoff_step_ratio)
-        )
-
-        if step_index == cutoff_step:
-            if pipeline.do_classifier_free_guidance:
-                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-                add_text_embeds = torch.cat([negative_add_text_embeds, add_text_embeds])
-                add_time_ids = torch.cat([negative_add_time_ids, add_time_ids])
-            callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
-            callback_kwargs[self.tensor_inputs[1]] = add_text_embeds
-            callback_kwargs[self.tensor_inputs[2]] = add_time_ids
-        return callback_kwargs
-
-
-pipeline: StableDiffusionXLPipeline = StableDiffusionXLPipeline.from_pretrained(
-    "stabilityai/stable-diffusion-xl-base-1.0",
-    torch_dtype=torch.float16,
-    variant="fp16",
-    use_safetensors=True,
-).to("cuda")
-
-callbacks = []
-for index in range(1, 20):
-    (
-        prompt_embeds,
-        negative_prompt_embeds,
-        pooled_prompt_embeds,
-        negative_pooled_prompt_embeds,
-    ) = pipeline.encode_prompt(
-        prompt=f"prompt {index}",
-        negative_prompt=f"prompt {index}",
-        device=pipeline._execution_device,
-        num_images_per_prompt=1,
-        # pipeline.do_classifier_free_guidance can't be accessed until after pipeline is ran
-        do_classifier_free_guidance=True,
-    )
-    text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
-    add_time_ids = pipeline._get_add_time_ids(
-        (1024, 1024),
-        (0, 0),
-        (1024, 1024),
-        dtype=prompt_embeds.dtype,
-        text_encoder_projection_dim=text_encoder_projection_dim,
-    )
-    negative_add_time_ids = pipeline._get_add_time_ids(
-        (1024, 1024),
-        (0, 0),
-        (1024, 1024),
-        dtype=prompt_embeds.dtype,
-        text_encoder_projection_dim=text_encoder_projection_dim,
-    )
-    callbacks.append(
-        SDXLPromptSchedulingCallback(
-            encoded_prompt=(prompt_embeds, negative_prompt_embeds),
-            add_text_embeds=(pooled_prompt_embeds, negative_pooled_prompt_embeds),
-            add_time_ids=(add_time_ids, negative_add_time_ids),
-            cutoff_step_index=index,
-        )
-    )
-
-
-callback = MultiPipelineCallbacks(callbacks)
-
-image = pipeline(
-    prompt="prompt",
-    negative_prompt="negative prompt",
-    callback_on_step_end=callback,
-    callback_on_step_end_tensor_inputs=[
-        "prompt_embeds",
-        "add_text_embeds",
-        "add_time_ids",
-    ],
-).images[0]
-```
@@ -43,7 +43,8 @@ from diffusers.utils import BaseOutput, check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")
+

 class MarigoldDepthOutput(BaseOutput):
    """
@@ -1008,8 +1008,6 @@ class HunyuanDiTDifferentialImg2ImgPipeline(DiffusionPipeline):
            self.transformer.inner_dim // self.transformer.num_heads,
            grid_crops_coords,
            (grid_height, grid_width),
-            device=device,
-            output_type="pt",
        )

        style = torch.tensor([0], device=device)
@@ -129,7 +129,7 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
        self.power = int(rp_args["power"]) if "power" in rp_args else 1

        prompts = prompt if isinstance(prompt, list) else [prompt]
-        n_prompts = negative_prompt if isinstance(negative_prompt, list) else [negative_prompt]
+        n_prompts = negative_prompt if isinstance(prompt, list) else [negative_prompt]
        self.batch = batch = num_images_per_prompt * len(prompts)

        if use_base:
@@ -73,7 +73,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -66,7 +66,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -79,7 +79,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -72,7 +72,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -78,7 +78,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -1,6 +1,6 @@
-# ControlNet training example for Stable Diffusion 3/3.5 (SD3/3.5)
+# ControlNet training example for Stable Diffusion 3 (SD3)

-The `train_controlnet_sd3.py` script shows how to implement the ControlNet training procedure and adapt it for [Stable Diffusion 3](https://arxiv.org/abs/2403.03206) and [Stable Diffusion 3.5](https://stability.ai/news/introducing-stable-diffusion-3-5).
+The `train_controlnet_sd3.py` script shows how to implement the ControlNet training procedure and adapt it for [Stable Diffusion 3](https://arxiv.org/abs/2403.03206).

 ## Running locally with PyTorch

@@ -51,9 +51,9 @@ Please download the dataset and unzip it in the directory `fill50k` in the `exam

 ## Training

-First download the SD3 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers) or the SD3.5 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium). We will use it as a base model for the ControlNet training.
+First download the SD3 model from [Hugging Face Hub](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers). We will use it as a base model for the ControlNet training.
 > [!NOTE]
-> As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers) or [Stable Diffusion 3.5 Large Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3.5-medium), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
+> As the model is gated, before using it with diffusers you first need to go to the [Stable Diffusion 3 Medium Hugging Face page](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:

 ```bash
 huggingface-cli login
@@ -90,8 +90,6 @@ accelerate launch train_controlnet_sd3.py \
    --gradient_accumulation_steps=4
 ```

-To train a ControlNet model for Stable Diffusion 3.5, replace the `MODEL_DIR` with `stabilityai/stable-diffusion-3.5-medium`.
-
 To better track our training experiments, we're using flags `validation_image`, `validation_prompt`, and `validation_steps` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.

 Our experiments were conducted on a single 40GB A100 GPU.
@@ -126,8 +124,6 @@ image = pipe(
 image.save("./output.png")
 ```

-Similarly, for SD3.5, replace the `base_model_path` with `stabilityai/stable-diffusion-3.5-medium` and controlnet_path `DavyMorgan/sd35-controlnet-out'.
-
 ## Notes

 ### GPU usage
@@ -139,8 +135,6 @@ Make sure to use the right GPU when configuring the [accelerator](https://huggin

 ## Example results

-### SD3
-
 #### After 500 steps with batch size 8

 | |  |
@@ -156,20 +150,3 @@ Make sure to use the right GPU when configuring the [accelerator](https://huggin
 || pale golden rod circle with old lace background |
 ![conditioning image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png) | ![pale golden rod circle with old lace background](https://huggingface.co/datasets/DavyMorgan/sd3-controlnet-results/resolve/main/step-6500.png) |

-### SD3.5
-
-#### After 500 steps with batch size 8
-
-| |                                                                                                                                                     |
-|-------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------:|
-||                                                   pale golden rod circle with old lace background                                                   |
- ![conditioning image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png) | ![pale golden rod circle with old lace background](https://huggingface.co/datasets/DavyMorgan/sd3-controlnet-results/resolve/main/step-500-3.5.png) |
-
-
-#### After 3000 steps with batch size 8:
-
-| |                                                                                                                                                      |
-|-------------------|:----------------------------------------------------------------------------------------------------------------------------------------------------:|
-||                                                   pale golden rod circle with old lace background                                                    |
- ![conditioning image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet_training/conditioning_image_1.png) | ![pale golden rod circle with old lace background](https://huggingface.co/datasets/DavyMorgan/sd3-controlnet-results/resolve/main/step-3000-3.5.png) |
-
@@ -138,27 +138,6 @@ class ControlNetSD3(ExamplesTestsAccelerate):
            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))


-class ControlNetSD35(ExamplesTestsAccelerate):
-    def test_controlnet_sd3(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-            examples/controlnet/train_controlnet_sd3.py
-            --pretrained_model_name_or_path=hf-internal-testing/tiny-sd35-pipe
-            --dataset_name=hf-internal-testing/fill10
-            --output_dir={tmpdir}
-            --resolution=64
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --controlnet_model_name_or_path=DavyMorgan/tiny-controlnet-sd35
-            --max_train_steps=4
-            --checkpointing_steps=2
-            """.split()
-
-            run_command(self._launch_args + test_args)
-
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "diffusion_pytorch_model.safetensors")))
-
-
 class ControlNetflux(ExamplesTestsAccelerate):
    def test_controlnet_flux(self):
        with tempfile.TemporaryDirectory() as tmpdir:
@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = logging.getLogger(__name__)

@@ -65,7 +65,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)
 if is_torch_npu_available():
@@ -59,7 +59,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -263,12 +263,6 @@ def parse_args(input_args=None):
        help="Path to pretrained controlnet model or model identifier from huggingface.co/models."
        " If not specified controlnet weights are initialized from unet.",
    )
-    parser.add_argument(
-        "--num_extra_conditioning_channels",
-        type=int,
-        default=0,
-        help="Number of extra conditioning channels for controlnet.",
-    )
    parser.add_argument(
        "--revision",
        type=str,
@@ -545,9 +539,6 @@ def parse_args(input_args=None):
        default=77,
        help="Maximum sequence length to use with with the T5 text encoder",
    )
-    parser.add_argument(
-        "--dataset_preprocess_batch_size", type=int, default=1000, help="Batch size for preprocessing dataset."
-    )
    parser.add_argument(
        "--validation_prompt",
        type=str,
@@ -995,9 +986,7 @@ def main(args):
        controlnet = SD3ControlNetModel.from_pretrained(args.controlnet_model_name_or_path)
    else:
        logger.info("Initializing controlnet weights from transformer")
-        controlnet = SD3ControlNetModel.from_transformer(
-            transformer, num_extra_conditioning_channels=args.num_extra_conditioning_channels
-        )
+        controlnet = SD3ControlNetModel.from_transformer(transformer)

    transformer.requires_grad_(False)
    vae.requires_grad_(False)
@@ -1134,12 +1123,7 @@ def main(args):
        # fingerprint used by the cache for the other processes to load the result
        # details: https://github.com/huggingface/diffusers/pull/4038#discussion_r1266078401
        new_fingerprint = Hasher.hash(args)
-        train_dataset = train_dataset.map(
-            compute_embeddings_fn,
-            batched=True,
-            batch_size=args.dataset_preprocess_batch_size,
-            new_fingerprint=new_fingerprint,
-        )
+        train_dataset = train_dataset.map(compute_embeddings_fn, batched=True, new_fingerprint=new_fingerprint)

    del text_encoder_one, text_encoder_two, text_encoder_three
    del tokenizer_one, tokenizer_two, tokenizer_three
@@ -61,7 +61,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)
 if is_torch_npu_available():
@@ -63,7 +63,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -1,127 +0,0 @@
-# DreamBooth training example for SANA
-
-[DreamBooth](https://arxiv.org/abs/2208.12242) is a method to personalize text2image models like stable diffusion given just a few (3~5) images of a subject.
-
-The `train_dreambooth_lora_sana.py` script shows how to implement the training procedure with [LoRA](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) and adapt it for [SANA](https://arxiv.org/abs/2410.10629). 
-
-
-This will also allow us to push the trained model parameters to the Hugging Face Hub platform.
-
-## Running locally with PyTorch
-
-### Installing the dependencies
-
-Before running the scripts, make sure to install the library's training dependencies:
-
-**Important**
-
-To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the install up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
-
-```bash
-git clone https://github.com/huggingface/diffusers
-cd diffusers
-pip install -e .
-```
-
-Then cd in the `examples/dreambooth` folder and run
-```bash
-pip install -r requirements_sana.txt
-```
-
-And initialize an [🤗Accelerate](https://github.com/huggingface/accelerate/) environment with:
-
-```bash
-accelerate config
-```
-
-Or for a default accelerate configuration without answering questions about your environment
-
-```bash
-accelerate config default
-```
-
-Or if your environment doesn't support an interactive shell (e.g., a notebook)
-
-```python
-from accelerate.utils import write_basic_config
-write_basic_config()
-```
-
-When running `accelerate config`, if we specify torch compile mode to True there can be dramatic speedups.
-Note also that we use PEFT library as backend for LoRA training, make sure to have `peft>=0.14.0` installed in your environment.
-
-
-### Dog toy example
-
-Now let's get our dataset. For this example we will use some dog images: https://huggingface.co/datasets/diffusers/dog-example.
-
-Let's first download it locally:
-
-```python
-from huggingface_hub import snapshot_download
-
-local_dir = "./dog"
-snapshot_download(
-    "diffusers/dog-example",
-    local_dir=local_dir, repo_type="dataset",
-    ignore_patterns=".gitattributes",
-)
-```
-
-This will also allow us to push the trained LoRA parameters to the Hugging Face Hub platform.
-
-Now, we can launch training using:
-
-```bash
-export MODEL_NAME="Efficient-Large-Model/Sana_1600M_1024px_BF16_diffusers"
-export INSTANCE_DIR="dog"
-export OUTPUT_DIR="trained-sana-lora"
-
-accelerate launch train_dreambooth_lora_sana.py \
-  --pretrained_model_name_or_path=$MODEL_NAME  \
-  --instance_data_dir=$INSTANCE_DIR \
-  --output_dir=$OUTPUT_DIR \
-  --mixed_precision="bf16" \
-  --instance_prompt="a photo of sks dog" \
-  --resolution=1024 \
-  --train_batch_size=1 \
-  --gradient_accumulation_steps=4 \
-  --use_8bit_adam \
-  --learning_rate=1e-4 \
-  --report_to="wandb" \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=500 \
-  --validation_prompt="A photo of sks dog in a bucket" \
-  --validation_epochs=25 \
-  --seed="0" \
-  --push_to_hub
-```
-
-For using `push_to_hub`, make you're logged into your Hugging Face account:
-
-```bash
-huggingface-cli login
-```
-
-To better track our training experiments, we're using the following flags in the command above:
-
-* `report_to="wandb` will ensure the training runs are tracked on [Weights and Biases](https://wandb.ai/site). To use it, be sure to install `wandb` with `pip install wandb`. Don't forget to call `wandb login <your_api_key>` before training if you haven't done it before.
-* `validation_prompt` and `validation_epochs` to allow the script to do a few validation inference runs. This allows us to qualitatively check if the training is progressing as expected.
-
-## Notes
-
-Additionally, we welcome you to explore the following CLI arguments:
-
-* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
-* `--complex_human_instruction`: Instructions for complex human attention as shown in [here](https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55).
-* `--max_sequence_length`: Maximum sequence length to use for text embeddings.
-
-
-We provide several options for optimizing memory optimization:
-
-* `--offload`: When enabled, we will offload the text encoder and VAE to CPU, when they are not used.
-* `cache_latents`: When enabled, we will pre-compute the latents from the input images with the VAE and remove the VAE from memory once done.
-* `--use_8bit_adam`: When enabled, we will use the 8bit version of AdamW provided by the `bitsandbytes` library.
-
-Refer to the [official documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/sana) of the `SanaPipeline` to know more about the models available under the SANA family and their preferred dtypes during inference.
@@ -1,8 +0,0 @@
-accelerate>=1.0.0
-torchvision
-transformers>=4.47.0
-ftfy
-tensorboard
-Jinja2
-peft>=0.14.0
-sentencepiece
@@ -1,206 +0,0 @@
-# coding=utf-8
-# Copyright 2024 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import logging
-import os
-import sys
-import tempfile
-
-import safetensors
-
-
-sys.path.append("..")
-from test_examples_utils import ExamplesTestsAccelerate, run_command  # noqa: E402
-
-
-logging.basicConfig(level=logging.DEBUG)
-
-logger = logging.getLogger()
-stream_handler = logging.StreamHandler(sys.stdout)
-logger.addHandler(stream_handler)
-
-
-class DreamBoothLoRASANA(ExamplesTestsAccelerate):
-    instance_data_dir = "docs/source/en/imgs"
-    pretrained_model_name_or_path = "hf-internal-testing/tiny-sana-pipe"
-    script_path = "examples/dreambooth/train_dreambooth_lora_sana.py"
-    transformer_layer_type = "transformer_blocks.0.attn1.to_k"
-
-    def test_dreambooth_lora_sana(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                {self.script_path}
-                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
-                --instance_data_dir {self.instance_data_dir}
-                --resolution 32
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --max_sequence_length 16
-                """.split()
-
-            test_args.extend(["--instance_prompt", ""])
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # when not training the text encoder, all the parameters in the state dict should start
-            # with `"transformer"` in their names.
-            starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
-            self.assertTrue(starts_with_transformer)
-
-    def test_dreambooth_lora_latent_caching(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                {self.script_path}
-                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
-                --instance_data_dir {self.instance_data_dir}
-                --resolution 32
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --cache_latents
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --max_sequence_length 16
-                """.split()
-
-            test_args.extend(["--instance_prompt", ""])
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # when not training the text encoder, all the parameters in the state dict should start
-            # with `"transformer"` in their names.
-            starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
-            self.assertTrue(starts_with_transformer)
-
-    def test_dreambooth_lora_layers(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                {self.script_path}
-                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
-                --instance_data_dir {self.instance_data_dir}
-                --resolution 32
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --cache_latents
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lora_layers {self.transformer_layer_type}
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                --max_sequence_length 16
-                """.split()
-
-            test_args.extend(["--instance_prompt", ""])
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # when not training the text encoder, all the parameters in the state dict should start
-            # with `"transformer"` in their names. In this test, we only params of
-            # `self.transformer_layer_type` should be in the state dict.
-            starts_with_transformer = all(self.transformer_layer_type in key for key in lora_state_dict)
-            self.assertTrue(starts_with_transformer)
-
-    def test_dreambooth_lora_sana_checkpointing_checkpoints_total_limit(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-            {self.script_path}
-            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
-            --instance_data_dir={self.instance_data_dir}
-            --output_dir={tmpdir}
-            --resolution=32
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --max_train_steps=6
-            --checkpoints_total_limit=2
-            --checkpointing_steps=2
-            --max_sequence_length 16
-            """.split()
-
-            test_args.extend(["--instance_prompt", ""])
-            run_command(self._launch_args + test_args)
-
-            self.assertEqual(
-                {x for x in os.listdir(tmpdir) if "checkpoint" in x},
-                {"checkpoint-4", "checkpoint-6"},
-            )
-
-    def test_dreambooth_lora_sana_checkpointing_checkpoints_total_limit_removes_multiple_checkpoints(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-            {self.script_path}
-            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
-            --instance_data_dir={self.instance_data_dir}
-            --output_dir={tmpdir}
-            --resolution=32
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --max_train_steps=4
-            --checkpointing_steps=2
-            --max_sequence_length 166
-            """.split()
-
-            test_args.extend(["--instance_prompt", ""])
-            run_command(self._launch_args + test_args)
-
-            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-2", "checkpoint-4"})
-
-            resume_run_args = f"""
-            {self.script_path}
-            --pretrained_model_name_or_path={self.pretrained_model_name_or_path}
-            --instance_data_dir={self.instance_data_dir}
-            --output_dir={tmpdir}
-            --resolution=32
-            --train_batch_size=1
-            --gradient_accumulation_steps=1
-            --max_train_steps=8
-            --checkpointing_steps=2
-            --resume_from_checkpoint=checkpoint-4
-            --checkpoints_total_limit=2
-            --max_sequence_length 16
-            """.split()
-
-            resume_run_args.extend(["--instance_prompt", ""])
-            run_command(self._launch_args + resume_run_args)
-
-            self.assertEqual({x for x in os.listdir(tmpdir) if "checkpoint" in x}, {"checkpoint-6", "checkpoint-8"})
@@ -63,7 +63,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -1300,17 +1300,16 @@ def main(args):
                    # Since we predict the noise instead of x_0, the original formulation is slightly changed.
                    # This is discussed in Section 4.2 of the same paper.
                    snr = compute_snr(noise_scheduler, timesteps)
+                    base_weight = (
+                        torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / snr
+                    )

                    if noise_scheduler.config.prediction_type == "v_prediction":
                        # Velocity objective needs to be floored to an SNR weight of one.
-                        divisor = snr + 1
+                        mse_loss_weights = base_weight * snr / (snr + 1)  # == min(snr, gamma) / (snr + 1)
                    else:
-                        divisor = snr
-
-                    mse_loss_weights = (
-                        torch.stack([snr, args.snr_gamma * torch.ones_like(timesteps)], dim=1).min(dim=1)[0] / divisor
-                    )
-
+                        # Epsilon and sample both use the same loss weights.
+                        mse_loss_weights = base_weight
                    loss = F.mse_loss(model_pred.float(), target.float(), reduction="none")
                    loss = loss.mean(dim=list(range(1, len(loss.shape)))) * mse_loss_weights
                    loss = loss.mean()
@@ -35,7 +35,7 @@ from diffusers.utils import check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 # Cache compiled models across invocations of this script.
 cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))
@@ -65,7 +65,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -70,7 +70,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -72,7 +72,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -72,7 +72,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -79,7 +79,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -63,7 +63,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -1,204 +0,0 @@
-# Training Flux Control
-
-This (experimental) example shows how to train Control LoRAs with [Flux](https://huggingface.co/black-forest-labs/FLUX.1-dev) by conditioning it with additional structural controls (like depth maps, poses, etc.). We also provide a script for full fine-tuning; refer to [this section](#full-fine-tuning). To learn more about the Flux Control family, refer to the following resources:
-
-* [Docs](https://github.com/black-forest-labs/flux/blob/main/docs/structural-conditioning.md) by Black Forest Labs
-* Diffusers docs ([1](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux#canny-control), [2](https://huggingface.co/docs/diffusers/main/en/api/pipelines/flux#depth-control))
-
-To incorporate additional condition latents, we expand the input features of Flux.1-Dev from 64 to 128. The first 64 channels correspond to the original input latents to be denoised, while the latter 64 channels correspond to control latents. This expansion happens on the `x_embedder` layer, where the combined latents are projected to the expected feature dimension of the rest of the network. Inference is performed using the `FluxControlPipeline`.
-
-> [!NOTE]
-> **Gated model**
->
-> As the model is gated, before using it with diffusers you first need to go to the [FLUX.1 [dev] Hugging Face page](https://huggingface.co/black-forest-labs/FLUX.1-dev), fill in the form and accept the gate. Once you are in, you need to log in so that your system knows you’ve accepted the gate. Use the command below to log in:
-
-```bash
-huggingface-cli login
-```
-
-The example command below shows how to launch fine-tuning for pose conditions. The dataset ([`raulc0399/open_pose_controlnet`](https://huggingface.co/datasets/raulc0399/open_pose_controlnet)) being used here already has the pose conditions of the original images, so we don't have to compute them.
-
-```bash
-accelerate launch train_control_lora_flux.py \
-  --pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \
-  --dataset_name="raulc0399/open_pose_controlnet" \
-  --output_dir="pose-control-lora" \
-  --mixed_precision="bf16" \
-  --train_batch_size=1 \
-  --rank=64 \
-  --gradient_accumulation_steps=4 \
-  --gradient_checkpointing \
-  --use_8bit_adam \
-  --learning_rate=1e-4 \
-  --report_to="wandb" \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --max_train_steps=5000 \
-  --validation_image="openpose.png" \
-  --validation_prompt="A couple, 4k photo, highly detailed" \
-  --offload \
-  --seed="0" \
-  --push_to_hub
-```
-
-`openpose.png` comes from [here](https://huggingface.co/Adapter/t2iadapter/resolve/main/openpose.png).
-
-You need to install `diffusers` from the branch of [this PR](https://github.com/huggingface/diffusers/pull/9999). Once it is merged, you should install `diffusers` from `main` instead.
-
-The training script exposes additional CLI args that might be useful to experiment with:
-
-* `use_lora_bias`: When set, additionally trains the biases of the `lora_B` layer. 
-* `train_norm_layers`: When set, additionally trains the normalization scales. Takes care of saving and loading.
-* `lora_layers`: Specify the layers you want to apply LoRA to. If you specify "all-linear", all the linear layers will be LoRA-attached.
-
-### Training with DeepSpeed
-
-It's possible to train with [DeepSpeed](https://github.com/microsoft/DeepSpeed), specifically leveraging the Zero2 system optimization. To use it, save the following config to a YAML file (feel free to modify as needed):
-
-```yaml
-compute_environment: LOCAL_MACHINE
-debug: false
-deepspeed_config:
-  gradient_accumulation_steps: 1
-  gradient_clipping: 1.0
-  offload_optimizer_device: cpu
-  offload_param_device: cpu
-  zero3_init_flag: false
-  zero_stage: 2
-distributed_type: DEEPSPEED
-downcast_bf16: 'no'
-enable_cpu_affinity: false
-machine_rank: 0
-main_training_function: main
-mixed_precision: bf16
-num_machines: 1
-num_processes: 1
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
-```
-
-And then while launching training, pass the config file:
-
-```bash
-accelerate launch --config_file=CONFIG_FILE.yaml ...
-```
-
-### Inference
-
-The pose images in our dataset were computed using the [`controlnet_aux`](https://github.com/huggingface/controlnet_aux) library. Let's install it first:
-
-```bash
-pip install controlnet_aux
-```
-
-And then we are ready:
-
-```py
-from controlnet_aux import OpenposeDetector
-from diffusers import FluxControlPipeline
-from diffusers.utils import load_image
-from PIL import Image
-import numpy as np
-import torch 
-
-pipe = FluxControlPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16).to("cuda")
-pipe.load_lora_weights("...") # change this.
-
-open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
-
-# prepare pose condition.
-url = "https://huggingface.co/Adapter/t2iadapter/resolve/main/people.jpg"
-image = load_image(url)
-image = open_pose(image, detect_resolution=512, image_resolution=1024)
-image = np.array(image)[:, :, ::-1]           
-image = Image.fromarray(np.uint8(image))
-
-prompt = "A couple, 4k photo, highly detailed"
-
-gen_images = pipe(
-  prompt=prompt,
-  condition_image=image,
-  num_inference_steps=50,
-  joint_attention_kwargs={"scale": 0.9},
-  guidance_scale=25., 
-).images[0]
-gen_images.save("output.png")
-```
-
-## Full fine-tuning
-
-We provide a non-LoRA version of the training script `train_control_flux.py`. Here is an example command:
-
-```bash
-accelerate launch --config_file=accelerate_ds2.yaml train_control_flux.py \
-  --pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \
-  --dataset_name="raulc0399/open_pose_controlnet" \
-  --output_dir="pose-control" \
-  --mixed_precision="bf16" \
-  --train_batch_size=2 \
-  --dataloader_num_workers=4 \
-  --gradient_accumulation_steps=4 \
-  --gradient_checkpointing \
-  --use_8bit_adam \
-  --proportion_empty_prompts=0.2 \
-  --learning_rate=5e-5 \
-  --adam_weight_decay=1e-4 \
-  --report_to="wandb" \
-  --lr_scheduler="cosine" \
-  --lr_warmup_steps=1000 \
-  --checkpointing_steps=1000 \
-  --max_train_steps=10000 \
-  --validation_steps=200 \
-  --validation_image "2_pose_1024.jpg" "3_pose_1024.jpg" \
-  --validation_prompt "two friends sitting by each other enjoying a day at the park, full hd, cinematic" "person enjoying a day at the park, full hd, cinematic" \
-  --offload \
-  --seed="0" \
-  --push_to_hub
-```
-
-Change the `validation_image` and `validation_prompt` as needed.
-
-For inference, this time, we will run:
-
-```py
-from controlnet_aux import OpenposeDetector
-from diffusers import FluxControlPipeline, FluxTransformer2DModel
-from diffusers.utils import load_image
-from PIL import Image
-import numpy as np
-import torch 
-
-transformer = FluxTransformer2DModel.from_pretrained("...") # change this.
-pipe = FluxControlPipeline.from_pretrained(
-  "black-forest-labs/FLUX.1-dev",  transformer=transformer, torch_dtype=torch.bfloat16
-).to("cuda")
-
-open_pose = OpenposeDetector.from_pretrained("lllyasviel/Annotators")
-
-# prepare pose condition.
-url = "https://huggingface.co/Adapter/t2iadapter/resolve/main/people.jpg"
-image = load_image(url)
-image = open_pose(image, detect_resolution=512, image_resolution=1024)
-image = np.array(image)[:, :, ::-1]           
-image = Image.fromarray(np.uint8(image))
-
-prompt = "A couple, 4k photo, highly detailed"
-
-gen_images = pipe(
-  prompt=prompt,
-  condition_image=image,
-  num_inference_steps=50,
-  guidance_scale=25., 
-).images[0]
-gen_images.save("output.png")
-```
-
-## Things to note
-
-* The scripts provided in this directory are experimental and educational. This means we may have to tweak things around to get good results on a given condition. We believe this is best done with the community 🤗
-* The scripts are not memory-optimized but we offload the VAE and the text encoders to CPU when they are not used. 
-* We can extract LoRAs from the fully fine-tuned model. While we currently don't provide any utilities for that, users are welcome to refer to [this script](https://github.com/Stability-AI/stability-ComfyUI-nodes/blob/master/control_lora_create.py), which provides similar functionality.
@@ -1,6 +0,0 @@
-transformers==4.47.0
-wandb
-torch
-torchvision
-accelerate==1.2.0
-peft>=0.14.0
@@ -57,7 +57,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -52,7 +52,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -46,7 +46,7 @@ from diffusers.utils import check_min_version, is_wandb_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -46,7 +46,7 @@ from diffusers.utils import check_min_version, is_wandb_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -51,7 +51,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -1,175 +0,0 @@
-# Search models on Civitai and Hugging Face
-
-The [auto_diffusers](https://github.com/suzukimain/auto_diffusers) library provides additional functionalities to Diffusers such as searching for models on Civitai and the Hugging Face Hub.
-Please refer to the original library [here](https://pypi.org/project/auto-diffusers/)
-
-## Installation
-
-Before running the scripts, make sure to install the library's training dependencies:
-
-> [!IMPORTANT]
-> To make sure you can successfully run the latest versions of the example scripts, we highly recommend **installing from source** and keeping the installation up to date as we update the example scripts frequently and install some example-specific requirements. To do this, execute the following steps in a new virtual environment.
-
-```bash
-git clone https://github.com/huggingface/diffusers
-cd diffusers
-pip install .
-```
-Set up the pipeline. You can also cd to this folder and run it.
-```bash
-!wget https://raw.githubusercontent.com/suzukimain/auto_diffusers/refs/heads/master/src/auto_diffusers/pipeline_easy.py
-```
-
-## Load from Civitai
-```python
-from pipeline_easy import (
-    EasyPipelineForText2Image,
-    EasyPipelineForImage2Image,
-    EasyPipelineForInpainting,
-)
-
-# Text-to-Image
-pipeline = EasyPipelineForText2Image.from_civitai(
-    "search_word",
-    base_model="SD 1.5",
-).to("cuda")
-
-
-# Image-to-Image
-pipeline = EasyPipelineForImage2Image.from_civitai(
-    "search_word",
-    base_model="SD 1.5",
-).to("cuda")
-
-
-# Inpainting
-pipeline = EasyPipelineForInpainting.from_civitai(
-    "search_word",
-    base_model="SD 1.5",
-).to("cuda")
-```
-
-## Load from Hugging Face
-```python
-from pipeline_easy import (
-    EasyPipelineForText2Image,
-    EasyPipelineForImage2Image,
-    EasyPipelineForInpainting,
-)
-
-# Text-to-Image
-pipeline = EasyPipelineForText2Image.from_huggingface(
-    "search_word",
-    checkpoint_format="diffusers",
-).to("cuda")
-
-
-# Image-to-Image
-pipeline = EasyPipelineForImage2Image.from_huggingface(
-    "search_word",
-    checkpoint_format="diffusers",
-).to("cuda")
-
-
-# Inpainting
-pipeline = EasyPipelineForInpainting.from_huggingface(
-    "search_word",
-    checkpoint_format="diffusers",
-).to("cuda")
-```
-
-
-## Search Civitai and Huggingface
-
-```python
-from pipeline_easy import (
-    search_huggingface,
-    search_civitai,
-) 
-
-# Search Lora
-Lora = search_civitai(
-    "Keyword_to_search_Lora",
-    model_type="LORA",
-    base_model = "SD 1.5",
-    download=True,
-    )
-# Load Lora into the pipeline.
-pipeline.load_lora_weights(Lora)
-
-
-# Search TextualInversion
-TextualInversion = search_civitai(
-    "EasyNegative",
-    model_type="TextualInversion",
-    base_model = "SD 1.5",
-    download=True
-)
-# Load TextualInversion into the pipeline.
-pipeline.load_textual_inversion(TextualInversion, token="EasyNegative")
-```
-
-### Search Civitai
-
-> [!TIP]
-> **If an error occurs, insert the `token` and run again.**
-
-#### `EasyPipeline.from_civitai` parameters
-
-| Name            | Type                   | Default       | Description                                                                    |
-|:---------------:|:----------------------:|:-------------:|:-----------------------------------------------------------------------------------:|
-| search_word     | string, Path           | ー            | The search query string. Can be a keyword, Civitai URL, local directory or file path. |
-| model_type      | string                 | `Checkpoint`  | The type of model to search for.  <br>(for example `Checkpoint`, `TextualInversion`, `Controlnet`, `LORA`, `Hypernetwork`, `AestheticGradient`, `Poses`)      |
-| base_model      | string                 | None          | Trained model tag (for example  `SD 1.5`, `SD 3.5`, `SDXL 1.0`) |
-| torch_dtype     | string, torch.dtype    | None          | Override the default `torch.dtype` and load the model with another dtype.     |
-| force_download  | bool                   | False         | Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. |
-| cache_dir       | string, Path | None    | Path to the folder where cached files are stored. |
-| resume          | bool   | False         | Whether to resume an incomplete download. |
-| token           | string | None          | API token for Civitai authentication. |
-
-
-#### `search_civitai` parameters
-
-| Name            | Type           | Default       | Description                                                                    |
-|:---------------:|:--------------:|:-------------:|:-----------------------------------------------------------------------------------:|
-| search_word     | string, Path   | ー            | The search query string. Can be a keyword, Civitai URL, local directory or file path. |
-| model_type      | string         | `Checkpoint`  | The type of model to search for. <br>(for example `Checkpoint`, `TextualInversion`, `Controlnet`, `LORA`, `Hypernetwork`, `AestheticGradient`, `Poses`)   |
-| base_model      | string         | None          | Trained model tag (for example  `SD 1.5`, `SD 3.5`, `SDXL 1.0`)                        |
-| download        | bool           | False         | Whether to download the model.                                   |
-| force_download  | bool           | False         | Whether to force the download if the model already exists.                          |
-| cache_dir       | string, Path   | None          | Path to the folder where cached files are stored.                              |
-| resume          | bool           | False         | Whether to resume an incomplete download.                                           |
-| token           | string         | None          | API token for Civitai authentication.                                               |
-| include_params  | bool           | False         | Whether to include parameters in the returned data.           |
-| skip_error      | bool           | False         | Whether to skip errors and return None.                                             |
-
-### Search Huggingface
-
-> [!TIP]
-> **If an error occurs, insert the `token` and run again.**
-
-#### `EasyPipeline.from_huggingface` parameters
-
-| Name                  | Type                | Default        | Description                                                      |
-|:---------------------:|:-------------------:|:--------------:|:----------------------------------------------------------------:|
-| search_word           | string, Path        | ー             | The search query string. Can be a keyword, Hugging Face URL, local directory or file path, or a Hugging Face path (`<creator>/<repo>`). |
-| checkpoint_format     | string              | `single_file`  | The format of the model checkpoint.<br>● `single_file` to search for `single file checkpoint` <br>●`diffusers` to search for `multifolder diffusers format checkpoint` |
-| torch_dtype           | string, torch.dtype | None           | Override the default `torch.dtype` and load the model with another dtype. |
-| force_download        | bool                | False          | Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. |
-| cache_dir             | string, Path        | None           | Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used.   |
-| token                 | string, bool        | None           | The token to use as HTTP bearer authorization for remote files.  |
-
-
-#### `search_huggingface` parameters
-
-| Name                  | Type                | Default        | Description                                                      |
-|:---------------------:|:-------------------:|:--------------:|:----------------------------------------------------------------:|
-| search_word           | string, Path        | ー             | The search query string. Can be a keyword, Hugging Face URL, local directory or file path, or a Hugging Face path (`<creator>/<repo>`). |
-| checkpoint_format     | string              | `single_file`  | The format of the model checkpoint. <br>● `single_file` to search for `single file checkpoint` <br>●`diffusers` to search for `multifolder diffusers format checkpoint` |
-| pipeline_tag          | string              | None           | Tag to filter models by pipeline.                                |
-| download              | bool                | False          | Whether to download the model.                                   |
-| force_download        | bool                | False          | Whether or not to force the (re-)download of the model weights and configuration files, overriding the cached versions if they exist. |
-| cache_dir             | string, Path        | None           | Path to a directory where a downloaded pretrained model configuration is cached if the standard cache is not used.   |
-| token                 | string, bool        | None           | The token to use as HTTP bearer authorization for remote files.  |
-| include_params        | bool                | False         | Whether to include parameters in the returned data.               |
-| skip_error            | bool                | False         | Whether to skip errors and return None.                           |
@@ -1 +0,0 @@
-huggingface-hub>=0.26.2
@@ -7,14 +7,13 @@ It has been tested on v4 and v5p TPU versions. Training code has been tested on
 This script implements Distributed Data Parallel using GSPMD feature in XLA compiler
 where we shard the input batches over the TPU devices. 

-As of 10-31-2024, these are some expected step times.
+As of 9-11-2024, these are some expected step times.

 | accelerator | global batch size | step time (seconds) |
 | ----------- | ----------------- | --------- |
-| v5p-512 | 16384 | 1.01 |
-| v5p-256 | 8192 | 1.01 |
-| v5p-128 | 4096 | 1.0 |
-| v5p-64 | 2048 | 1.01 |
+| v5p-128 | 1024 | 0.245 |
+| v5p-256 | 2048 | 0.234 |
+| v5p-512 | 4096 | 0.2498 |

 ## Create TPU

@@ -44,9 +43,8 @@ Install PyTorch and PyTorch/XLA nightly versions:
 gcloud compute tpus tpu-vm ssh ${TPU_NAME} \
 --project=${PROJECT_ID} --zone=${ZONE} --worker=all \
 --command='
-pip3 install --pre torch==2.6.0.dev20241031+cpu torchvision --index-url https://download.pytorch.org/whl/nightly/cpu
-pip3 install "torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.6.0.dev20241031.cxx11-cp310-cp310-linux_x86_64.whl" -f https://storage.googleapis.com/libtpu-releases/index.html
-pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
+pip3 install --pre torch==2.5.0.dev20240905+cpu torchvision==0.20.0.dev20240905+cpu --index-url https://download.pytorch.org/whl/nightly/cpu
+pip3 install "torch_xla[tpu] @ https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-2.5.0.dev20240905-cp310-cp310-linux_x86_64.whl" -f https://storage.googleapis.com/libtpu-releases/index.html
 '
 ```

@@ -90,18 +88,17 @@ are fixed.
 gcloud compute tpus tpu-vm ssh ${TPU_NAME} \
 --project=${PROJECT_ID} --zone=${ZONE} --worker=all \
 --command='
-export XLA_DISABLE_FUNCTIONALIZATION=0
+export XLA_DISABLE_FUNCTIONALIZATION=1 
 export PROFILE_DIR=/tmp/
 export CACHE_DIR=/tmp/
 export DATASET_NAME=lambdalabs/naruto-blip-captions
 export PER_HOST_BATCH_SIZE=32 # This is known to work on TPU v4. Can set this to 64 for TPU v5p
 export TRAIN_STEPS=50
 export OUTPUT_DIR=/tmp/trained-model/
-python diffusers/examples/research_projects/pytorch_xla/train_text_to_image_xla.py --pretrained_model_name_or_path=stabilityai/stable-diffusion-2-base --dataset_name=$DATASET_NAME --resolution=512 --center_crop --random_flip --train_batch_size=$PER_HOST_BATCH_SIZE  --max_train_steps=$TRAIN_STEPS --learning_rate=1e-06 --mixed_precision=bf16 --profile_duration=80000 --output_dir=$OUTPUT_DIR --dataloader_num_workers=8 --loader_prefetch_size=4 --device_prefetch_size=4'
+python diffusers/examples/research_projects/pytorch_xla/train_text_to_image_xla.py --pretrained_model_name_or_path=stabilityai/stable-diffusion-2-base --dataset_name=$DATASET_NAME --resolution=512 --center_crop --random_flip --train_batch_size=$PER_HOST_BATCH_SIZE  --max_train_steps=$TRAIN_STEPS --learning_rate=1e-06 --mixed_precision=bf16 --profile_duration=80000 --output_dir=$OUTPUT_DIR --dataloader_num_workers=4 --loader_prefetch_size=4 --device_prefetch_size=4'
+   
 ```

-Pass `--print_loss` if you would like to see the loss printed at every step. Be aware that printing the loss at every step disrupts the optimized flow execution, thus the step time will be longer. 
-
 ### Environment Envs Explained

 *   `XLA_DISABLE_FUNCTIONALIZATION`: To optimize the performance for AdamW optimizer.
@@ -140,43 +140,33 @@ class TrainSD:
        self.optimizer.step()

    def start_training(self):
-        dataloader_exception = False
-        measure_start_step = args.measure_start_step
-        assert measure_start_step < self.args.max_train_steps
-        total_time = 0
-        for step in range(0, self.args.max_train_steps):
+        times = []
+        last_time = time.time()
+        step = 0
+        while True:
+            if self.global_step >= self.args.max_train_steps:
+                xm.mark_step()
+                break
+            if step == 4 and PROFILE_DIR is not None:
+                xm.wait_device_ops()
+                xp.trace_detached(f"localhost:{PORT}", PROFILE_DIR, duration_ms=args.profile_duration)
            try:
                batch = next(self.dataloader)
            except Exception as e:
-                dataloader_exception = True
                print(e)
                break
-            if step == measure_start_step and PROFILE_DIR is not None:
-                xm.wait_device_ops()
-                xp.trace_detached(f"localhost:{PORT}", PROFILE_DIR, duration_ms=args.profile_duration)
-                last_time = time.time()
            loss = self.step_fn(batch["pixel_values"], batch["input_ids"])
+            step_time = time.time() - last_time
+            if step >= 10:
+                times.append(step_time)
+            print(f"step: {step}, step_time: {step_time}")
+            if step % 5 == 0:
+                print(f"step: {step}, loss: {loss}")
+            last_time = time.time()
            self.global_step += 1
-
-            def print_loss_closure(step, loss):
-                print(f"Step: {step}, Loss: {loss}")
-
-            if args.print_loss:
-                xm.add_step_closure(
-                    print_loss_closure,
-                    args=(
-                        self.global_step,
-                        loss,
-                    ),
-                )
-        xm.mark_step()
-        if not dataloader_exception:
-            xm.wait_device_ops()
-            total_time = time.time() - last_time
-            print(f"Average step time: {total_time/(self.args.max_train_steps-measure_start_step)}")
-        else:
-            print("dataloader exception happen, skip result")
-            return
+            step += 1
+        # print(f"Average step time: {sum(times)/len(times)}")
+        xm.wait_device_ops()

    def step_fn(
        self,
@@ -190,10 +180,7 @@ class TrainSD:
            noise = torch.randn_like(latents).to(self.device, dtype=self.weight_dtype)
            bsz = latents.shape[0]
            timesteps = torch.randint(
-                0,
-                self.noise_scheduler.config.num_train_timesteps,
-                (bsz,),
-                device=latents.device,
+                0, self.noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device
            )
            timesteps = timesteps.long()

@@ -237,6 +224,9 @@ class TrainSD:

 def parse_args():
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
+    parser.add_argument(
+        "--input_perturbation", type=float, default=0, help="The scale of input perturbation. Recommended 0.1."
+    )
    parser.add_argument("--profile_duration", type=int, default=10000, help="Profile duration in ms")
    parser.add_argument(
        "--pretrained_model_name_or_path",
@@ -268,6 +258,12 @@ def parse_args():
            " or to a folder containing files that 🤗 Datasets can understand."
        ),
    )
+    parser.add_argument(
+        "--dataset_config_name",
+        type=str,
+        default=None,
+        help="The config of the Dataset, leave as None if there's only one config.",
+    )
    parser.add_argument(
        "--train_data_dir",
        type=str,
@@ -287,6 +283,15 @@ def parse_args():
        default="text",
        help="The column of the dataset containing a caption or a list of captions.",
    )
+    parser.add_argument(
+        "--max_train_samples",
+        type=int,
+        default=None,
+        help=(
+            "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        ),
+    )
    parser.add_argument(
        "--output_dir",
        type=str,
@@ -299,6 +304,7 @@ def parse_args():
        default=None,
        help="The directory where the downloaded models and datasets will be stored.",
    )
+    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
    parser.add_argument(
        "--resolution",
        type=int,
@@ -368,19 +374,12 @@ def parse_args():
        default=1,
        help=("Number of subprocesses to use for data loading to cpu."),
    )
-    parser.add_argument(
-        "--loader_prefetch_factor",
-        type=int,
-        default=2,
-        help=("Number of batches loaded in advance by each worker."),
-    )
    parser.add_argument(
        "--device_prefetch_size",
        type=int,
        default=1,
        help=("Number of subprocesses to use for data loading to tpu from cpu. "),
    )
-    parser.add_argument("--measure_start_step", type=int, default=10, help="Step to start profiling.")
    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
@@ -395,8 +394,12 @@ def parse_args():
        "--mixed_precision",
        type=str,
        default=None,
-        choices=["no", "bf16"],
-        help=("Whether to use mixed precision. Bf16 requires PyTorch >= 1.10"),
+        choices=["no", "fp16", "bf16"],
+        help=(
+            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
+            " 1.10.and an Nvidia Ampere GPU.  Default to the value of accelerate config of the current system or the"
+            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
+        ),
    )
    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
@@ -406,12 +409,6 @@ def parse_args():
        default=None,
        help="The name of the repository to keep in sync with the local `output_dir`.",
    )
-    parser.add_argument(
-        "--print_loss",
-        default=False,
-        action="store_true",
-        help=("Print loss at every step."),
-    )

    args = parser.parse_args()

@@ -439,6 +436,7 @@ def load_dataset(args):
        # Downloading and loading a dataset from the hub.
        dataset = datasets.load_dataset(
            args.dataset_name,
+            args.dataset_config_name,
            cache_dir=args.cache_dir,
            data_dir=args.train_data_dir,
        )
@@ -483,7 +481,9 @@ def main(args):
    _ = xp.start_server(PORT)

    num_devices = xr.global_runtime_device_count()
-    mesh = xs.get_1d_mesh("data")
+    device_ids = np.arange(num_devices)
+    mesh_shape = (num_devices, 1)
+    mesh = xs.Mesh(device_ids, mesh_shape, ("x", "y"))
    xs.set_global_mesh(mesh)

    text_encoder = CLIPTextModel.from_pretrained(
@@ -520,7 +520,6 @@ def main(args):
    from torch_xla.distributed.fsdp.utils import apply_xla_patch_to_nn_linear

    unet = apply_xla_patch_to_nn_linear(unet, xs.xla_patched_nn_linear_forward)
-    unet.enable_xla_flash_attention(partition_spec=("data", None, None, None))

    vae.requires_grad_(False)
    text_encoder.requires_grad_(False)
@@ -531,12 +530,15 @@ def main(args):
    # as these weights are only used for inference, keeping weights in full
    # precision is not required.
    weight_dtype = torch.float32
-    if args.mixed_precision == "bf16":
+    if args.mixed_precision == "fp16":
+        weight_dtype = torch.float16
+    elif args.mixed_precision == "bf16":
        weight_dtype = torch.bfloat16

    device = xm.xla_device()
+    print("device: ", device)
+    print("weight_dtype: ", weight_dtype)

-    # Move text_encode and vae to device and cast to weight_dtype
    text_encoder = text_encoder.to(device, dtype=weight_dtype)
    vae = vae.to(device, dtype=weight_dtype)
    unet = unet.to(device, dtype=weight_dtype)
@@ -604,27 +606,24 @@ def main(args):
        collate_fn=collate_fn,
        num_workers=args.dataloader_num_workers,
        batch_size=args.train_batch_size,
-        prefetch_factor=args.loader_prefetch_factor,
    )

    train_dataloader = pl.MpDeviceLoader(
        train_dataloader,
        device,
        input_sharding={
-            "pixel_values": xs.ShardingSpec(mesh, ("data", None, None, None), minibatch=True),
-            "input_ids": xs.ShardingSpec(mesh, ("data", None), minibatch=True),
+            "pixel_values": xs.ShardingSpec(mesh, ("x", None, None, None), minibatch=True),
+            "input_ids": xs.ShardingSpec(mesh, ("x", None), minibatch=True),
        },
        loader_prefetch_size=args.loader_prefetch_size,
        device_prefetch_size=args.device_prefetch_size,
    )

-    num_hosts = xr.process_count()
-    num_devices_per_host = num_devices // num_hosts
    if xm.is_master_ordinal():
        print("***** Running training *****")
-        print(f"Instantaneous batch size per device = {args.train_batch_size // num_devices_per_host }")
+        print(f"Instantaneous batch size per device = {args.train_batch_size}")
        print(
-            f"Total train batch size (w. parallel, distributed & accumulation) = {args.train_batch_size * num_hosts}"
+            f"Total train batch size (w. parallel, distributed & accumulation) = {args.train_batch_size * num_devices}"
        )
        print(f"  Total optimization steps = {args.max_train_steps}")

@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -57,7 +57,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -49,7 +49,7 @@ from diffusers.utils import check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = logging.getLogger(__name__)

@@ -56,7 +56,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -68,7 +68,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)
 if is_torch_npu_available():
@@ -55,7 +55,7 @@ from diffusers.utils.torch_utils import is_compiled_module


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)
 if is_torch_npu_available():
@@ -81,7 +81,7 @@ else:


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -56,7 +56,7 @@ else:
 # ------------------------------------------------------------------------------

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = logging.getLogger(__name__)

@@ -76,7 +76,7 @@ else:


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__)

@@ -29,7 +29,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -50,7 +50,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0")
+check_min_version("0.32.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -1,323 +0,0 @@
-import argparse
-from typing import Any, Dict
-
-import torch
-from huggingface_hub import hf_hub_download
-from safetensors.torch import load_file
-
-from diffusers import AutoencoderDC
-
-
-def remap_qkv_(key: str, state_dict: Dict[str, Any]):
-    qkv = state_dict.pop(key)
-    q, k, v = torch.chunk(qkv, 3, dim=0)
-    parent_module, _, _ = key.rpartition(".qkv.conv.weight")
-    state_dict[f"{parent_module}.to_q.weight"] = q.squeeze()
-    state_dict[f"{parent_module}.to_k.weight"] = k.squeeze()
-    state_dict[f"{parent_module}.to_v.weight"] = v.squeeze()
-
-
-def remap_proj_conv_(key: str, state_dict: Dict[str, Any]):
-    parent_module, _, _ = key.rpartition(".proj.conv.weight")
-    state_dict[f"{parent_module}.to_out.weight"] = state_dict.pop(key).squeeze()
-
-
-AE_KEYS_RENAME_DICT = {
-    # common
-    "main.": "",
-    "op_list.": "",
-    "context_module": "attn",
-    "local_module": "conv_out",
-    # NOTE: The below two lines work because scales in the available configs only have a tuple length of 1
-    # If there were more scales, there would be more layers, so a loop would be better to handle this
-    "aggreg.0.0": "to_qkv_multiscale.0.proj_in",
-    "aggreg.0.1": "to_qkv_multiscale.0.proj_out",
-    "depth_conv.conv": "conv_depth",
-    "inverted_conv.conv": "conv_inverted",
-    "point_conv.conv": "conv_point",
-    "point_conv.norm": "norm",
-    "conv.conv.": "conv.",
-    "conv1.conv": "conv1",
-    "conv2.conv": "conv2",
-    "conv2.norm": "norm",
-    "proj.norm": "norm_out",
-    # encoder
-    "encoder.project_in.conv": "encoder.conv_in",
-    "encoder.project_out.0.conv": "encoder.conv_out",
-    "encoder.stages": "encoder.down_blocks",
-    # decoder
-    "decoder.project_in.conv": "decoder.conv_in",
-    "decoder.project_out.0": "decoder.norm_out",
-    "decoder.project_out.2.conv": "decoder.conv_out",
-    "decoder.stages": "decoder.up_blocks",
-}
-
-AE_F32C32_KEYS = {
-    # encoder
-    "encoder.project_in.conv": "encoder.conv_in.conv",
-    # decoder
-    "decoder.project_out.2.conv": "decoder.conv_out.conv",
-}
-
-AE_F64C128_KEYS = {
-    # encoder
-    "encoder.project_in.conv": "encoder.conv_in.conv",
-    # decoder
-    "decoder.project_out.2.conv": "decoder.conv_out.conv",
-}
-
-AE_F128C512_KEYS = {
-    # encoder
-    "encoder.project_in.conv": "encoder.conv_in.conv",
-    # decoder
-    "decoder.project_out.2.conv": "decoder.conv_out.conv",
-}
-
-AE_SPECIAL_KEYS_REMAP = {
-    "qkv.conv.weight": remap_qkv_,
-    "proj.conv.weight": remap_proj_conv_,
-}
-
-
-def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
-    state_dict = saved_dict
-    if "model" in saved_dict.keys():
-        state_dict = state_dict["model"]
-    if "module" in saved_dict.keys():
-        state_dict = state_dict["module"]
-    if "state_dict" in saved_dict.keys():
-        state_dict = state_dict["state_dict"]
-    return state_dict
-
-
-def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
-    state_dict[new_key] = state_dict.pop(old_key)
-
-
-def convert_ae(config_name: str, dtype: torch.dtype):
-    config = get_ae_config(config_name)
-    hub_id = f"mit-han-lab/{config_name}"
-    ckpt_path = hf_hub_download(hub_id, "model.safetensors")
-    original_state_dict = get_state_dict(load_file(ckpt_path))
-
-    ae = AutoencoderDC(**config).to(dtype=dtype)
-
-    for key in list(original_state_dict.keys()):
-        new_key = key[:]
-        for replace_key, rename_key in AE_KEYS_RENAME_DICT.items():
-            new_key = new_key.replace(replace_key, rename_key)
-        update_state_dict_(original_state_dict, key, new_key)
-
-    for key in list(original_state_dict.keys()):
-        for special_key, handler_fn_inplace in AE_SPECIAL_KEYS_REMAP.items():
-            if special_key not in key:
-                continue
-            handler_fn_inplace(key, original_state_dict)
-
-    ae.load_state_dict(original_state_dict, strict=True)
-    return ae
-
-
-def get_ae_config(name: str):
-    if name in ["dc-ae-f32c32-sana-1.0"]:
-        config = {
-            "latent_channels": 32,
-            "encoder_block_types": (
-                "ResBlock",
-                "ResBlock",
-                "ResBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-            ),
-            "decoder_block_types": (
-                "ResBlock",
-                "ResBlock",
-                "ResBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-            ),
-            "encoder_block_out_channels": (128, 256, 512, 512, 1024, 1024),
-            "decoder_block_out_channels": (128, 256, 512, 512, 1024, 1024),
-            "encoder_qkv_multiscales": ((), (), (), (5,), (5,), (5,)),
-            "decoder_qkv_multiscales": ((), (), (), (5,), (5,), (5,)),
-            "encoder_layers_per_block": (2, 2, 2, 3, 3, 3),
-            "decoder_layers_per_block": [3, 3, 3, 3, 3, 3],
-            "downsample_block_type": "conv",
-            "upsample_block_type": "interpolate",
-            "decoder_norm_types": "rms_norm",
-            "decoder_act_fns": "silu",
-            "scaling_factor": 0.41407,
-        }
-    elif name in ["dc-ae-f32c32-in-1.0", "dc-ae-f32c32-mix-1.0"]:
-        AE_KEYS_RENAME_DICT.update(AE_F32C32_KEYS)
-        config = {
-            "latent_channels": 32,
-            "encoder_block_types": [
-                "ResBlock",
-                "ResBlock",
-                "ResBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-            ],
-            "decoder_block_types": [
-                "ResBlock",
-                "ResBlock",
-                "ResBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-            ],
-            "encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024],
-            "decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024],
-            "encoder_layers_per_block": [0, 4, 8, 2, 2, 2],
-            "decoder_layers_per_block": [0, 5, 10, 2, 2, 2],
-            "encoder_qkv_multiscales": ((), (), (), (), (), ()),
-            "decoder_qkv_multiscales": ((), (), (), (), (), ()),
-            "decoder_norm_types": ["batch_norm", "batch_norm", "batch_norm", "rms_norm", "rms_norm", "rms_norm"],
-            "decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu"],
-        }
-        if name == "dc-ae-f32c32-in-1.0":
-            config["scaling_factor"] = 0.3189
-        elif name == "dc-ae-f32c32-mix-1.0":
-            config["scaling_factor"] = 0.4552
-    elif name in ["dc-ae-f64c128-in-1.0", "dc-ae-f64c128-mix-1.0"]:
-        AE_KEYS_RENAME_DICT.update(AE_F64C128_KEYS)
-        config = {
-            "latent_channels": 128,
-            "encoder_block_types": [
-                "ResBlock",
-                "ResBlock",
-                "ResBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-            ],
-            "decoder_block_types": [
-                "ResBlock",
-                "ResBlock",
-                "ResBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-            ],
-            "encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048],
-            "decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048],
-            "encoder_layers_per_block": [0, 4, 8, 2, 2, 2, 2],
-            "decoder_layers_per_block": [0, 5, 10, 2, 2, 2, 2],
-            "encoder_qkv_multiscales": ((), (), (), (), (), (), ()),
-            "decoder_qkv_multiscales": ((), (), (), (), (), (), ()),
-            "decoder_norm_types": [
-                "batch_norm",
-                "batch_norm",
-                "batch_norm",
-                "rms_norm",
-                "rms_norm",
-                "rms_norm",
-                "rms_norm",
-            ],
-            "decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu", "silu"],
-        }
-        if name == "dc-ae-f64c128-in-1.0":
-            config["scaling_factor"] = 0.2889
-        elif name == "dc-ae-f64c128-mix-1.0":
-            config["scaling_factor"] = 0.4538
-    elif name in ["dc-ae-f128c512-in-1.0", "dc-ae-f128c512-mix-1.0"]:
-        AE_KEYS_RENAME_DICT.update(AE_F128C512_KEYS)
-        config = {
-            "latent_channels": 512,
-            "encoder_block_types": [
-                "ResBlock",
-                "ResBlock",
-                "ResBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-            ],
-            "decoder_block_types": [
-                "ResBlock",
-                "ResBlock",
-                "ResBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-                "EfficientViTBlock",
-            ],
-            "encoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048, 2048],
-            "decoder_block_out_channels": [128, 256, 512, 512, 1024, 1024, 2048, 2048],
-            "encoder_layers_per_block": [0, 4, 8, 2, 2, 2, 2, 2],
-            "decoder_layers_per_block": [0, 5, 10, 2, 2, 2, 2, 2],
-            "encoder_qkv_multiscales": ((), (), (), (), (), (), (), ()),
-            "decoder_qkv_multiscales": ((), (), (), (), (), (), (), ()),
-            "decoder_norm_types": [
-                "batch_norm",
-                "batch_norm",
-                "batch_norm",
-                "rms_norm",
-                "rms_norm",
-                "rms_norm",
-                "rms_norm",
-                "rms_norm",
-            ],
-            "decoder_act_fns": ["relu", "relu", "relu", "silu", "silu", "silu", "silu", "silu"],
-        }
-        if name == "dc-ae-f128c512-in-1.0":
-            config["scaling_factor"] = 0.4883
-        elif name == "dc-ae-f128c512-mix-1.0":
-            config["scaling_factor"] = 0.3620
-    else:
-        raise ValueError("Invalid config name provided.")
-
-    return config
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--config_name",
-        type=str,
-        default="dc-ae-f32c32-sana-1.0",
-        choices=[
-            "dc-ae-f32c32-sana-1.0",
-            "dc-ae-f32c32-in-1.0",
-            "dc-ae-f32c32-mix-1.0",
-            "dc-ae-f64c128-in-1.0",
-            "dc-ae-f64c128-mix-1.0",
-            "dc-ae-f128c512-in-1.0",
-            "dc-ae-f128c512-mix-1.0",
-        ],
-        help="The DCAE checkpoint to convert",
-    )
-    parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
-    parser.add_argument("--dtype", default="fp32", help="Torch dtype to save the model in.")
-    return parser.parse_args()
-
-
-DTYPE_MAPPING = {
-    "fp32": torch.float32,
-    "fp16": torch.float16,
-    "bf16": torch.bfloat16,
-}
-
-VARIANT_MAPPING = {
-    "fp32": None,
-    "fp16": "fp16",
-    "bf16": "bf16",
-}
-
-
-if __name__ == "__main__":
-    args = get_args()
-
-    dtype = DTYPE_MAPPING[args.dtype]
-    variant = VARIANT_MAPPING[args.dtype]
-
-    ae = convert_ae(args.config_name, dtype)
-    ae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB", variant=variant)
@@ -1,97 +0,0 @@
-import argparse
-from contextlib import nullcontext
-
-import safetensors.torch
-from accelerate import init_empty_weights
-from huggingface_hub import hf_hub_download
-
-from diffusers.utils.import_utils import is_accelerate_available, is_transformers_available
-
-
-if is_transformers_available():
-    from transformers import CLIPVisionModelWithProjection
-
-    vision = True
-else:
-    vision = False
-
-"""
-python scripts/convert_flux_xlabs_ipadapter_to_diffusers.py  \
--original_state_dict_repo_id "XLabs-AI/flux-ip-adapter" \
--filename "flux-ip-adapter.safetensors"
--output_path "flux-ip-adapter-hf/"
-"""
-
-
-CTX = init_empty_weights if is_accelerate_available else nullcontext
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--original_state_dict_repo_id", default=None, type=str)
-parser.add_argument("--filename", default="flux.safetensors", type=str)
-parser.add_argument("--checkpoint_path", default=None, type=str)
-parser.add_argument("--output_path", type=str)
-parser.add_argument("--vision_pretrained_or_path", default="openai/clip-vit-large-patch14", type=str)
-
-args = parser.parse_args()
-
-
-def load_original_checkpoint(args):
-    if args.original_state_dict_repo_id is not None:
-        ckpt_path = hf_hub_download(repo_id=args.original_state_dict_repo_id, filename=args.filename)
-    elif args.checkpoint_path is not None:
-        ckpt_path = args.checkpoint_path
-    else:
-        raise ValueError(" please provide either `original_state_dict_repo_id` or a local `checkpoint_path`")
-
-    original_state_dict = safetensors.torch.load_file(ckpt_path)
-    return original_state_dict
-
-
-def convert_flux_ipadapter_checkpoint_to_diffusers(original_state_dict, num_layers):
-    converted_state_dict = {}
-
-    # image_proj
-    ## norm
-    converted_state_dict["image_proj.norm.weight"] = original_state_dict.pop("ip_adapter_proj_model.norm.weight")
-    converted_state_dict["image_proj.norm.bias"] = original_state_dict.pop("ip_adapter_proj_model.norm.bias")
-    ## proj
-    converted_state_dict["image_proj.proj.weight"] = original_state_dict.pop("ip_adapter_proj_model.norm.weight")
-    converted_state_dict["image_proj.proj.bias"] = original_state_dict.pop("ip_adapter_proj_model.norm.bias")
-
-    # double transformer blocks
-    for i in range(num_layers):
-        block_prefix = f"ip_adapter.{i}."
-        # to_k_ip
-        converted_state_dict[f"{block_prefix}to_k_ip.bias"] = original_state_dict.pop(
-            f"double_blocks.{i}.processor.ip_adapter_double_stream_k_proj.bias"
-        )
-        converted_state_dict[f"{block_prefix}to_k_ip.weight"] = original_state_dict.pop(
-            f"double_blocks.{i}.processor.ip_adapter_double_stream_k_proj.weight"
-        )
-        # to_v_ip
-        converted_state_dict[f"{block_prefix}to_v_ip.bias"] = original_state_dict.pop(
-            f"double_blocks.{i}.processor.ip_adapter_double_stream_v_proj.bias"
-        )
-        converted_state_dict[f"{block_prefix}to_k_ip.weight"] = original_state_dict.pop(
-            f"double_blocks.{i}.processor.ip_adapter_double_stream_v_proj.weight"
-        )
-
-    return converted_state_dict
-
-
-def main(args):
-    original_ckpt = load_original_checkpoint(args)
-
-    num_layers = 19
-    converted_ip_adapter_state_dict = convert_flux_ipadapter_checkpoint_to_diffusers(original_ckpt, num_layers)
-
-    print("Saving Flux IP-Adapter in Diffusers format.")
-    safetensors.torch.save_file(converted_ip_adapter_state_dict, f"{args.output_path}/model.safetensors")
-
-    if vision:
-        model = CLIPVisionModelWithProjection.from_pretrained(args.vision_pretrained_or_path)
-        model.save_pretrained(f"{args.output_path}/image_encoder")
-
-
-if __name__ == "__main__":
-    main(args)
@@ -1,257 +0,0 @@
-import argparse
-from typing import Any, Dict
-
-import torch
-from accelerate import init_empty_weights
-from transformers import AutoModel, AutoTokenizer, CLIPTextModel, CLIPTokenizer
-
-from diffusers import (
-    AutoencoderKLHunyuanVideo,
-    FlowMatchEulerDiscreteScheduler,
-    HunyuanVideoPipeline,
-    HunyuanVideoTransformer3DModel,
-)
-
-
-def remap_norm_scale_shift_(key, state_dict):
-    weight = state_dict.pop(key)
-    shift, scale = weight.chunk(2, dim=0)
-    new_weight = torch.cat([scale, shift], dim=0)
-    state_dict[key.replace("final_layer.adaLN_modulation.1", "norm_out.linear")] = new_weight
-
-
-def remap_txt_in_(key, state_dict):
-    def rename_key(key):
-        new_key = key.replace("individual_token_refiner.blocks", "token_refiner.refiner_blocks")
-        new_key = new_key.replace("adaLN_modulation.1", "norm_out.linear")
-        new_key = new_key.replace("txt_in", "context_embedder")
-        new_key = new_key.replace("t_embedder.mlp.0", "time_text_embed.timestep_embedder.linear_1")
-        new_key = new_key.replace("t_embedder.mlp.2", "time_text_embed.timestep_embedder.linear_2")
-        new_key = new_key.replace("c_embedder", "time_text_embed.text_embedder")
-        new_key = new_key.replace("mlp", "ff")
-        return new_key
-
-    if "self_attn_qkv" in key:
-        weight = state_dict.pop(key)
-        to_q, to_k, to_v = weight.chunk(3, dim=0)
-        state_dict[rename_key(key.replace("self_attn_qkv", "attn.to_q"))] = to_q
-        state_dict[rename_key(key.replace("self_attn_qkv", "attn.to_k"))] = to_k
-        state_dict[rename_key(key.replace("self_attn_qkv", "attn.to_v"))] = to_v
-    else:
-        state_dict[rename_key(key)] = state_dict.pop(key)
-
-
-def remap_img_attn_qkv_(key, state_dict):
-    weight = state_dict.pop(key)
-    to_q, to_k, to_v = weight.chunk(3, dim=0)
-    state_dict[key.replace("img_attn_qkv", "attn.to_q")] = to_q
-    state_dict[key.replace("img_attn_qkv", "attn.to_k")] = to_k
-    state_dict[key.replace("img_attn_qkv", "attn.to_v")] = to_v
-
-
-def remap_txt_attn_qkv_(key, state_dict):
-    weight = state_dict.pop(key)
-    to_q, to_k, to_v = weight.chunk(3, dim=0)
-    state_dict[key.replace("txt_attn_qkv", "attn.add_q_proj")] = to_q
-    state_dict[key.replace("txt_attn_qkv", "attn.add_k_proj")] = to_k
-    state_dict[key.replace("txt_attn_qkv", "attn.add_v_proj")] = to_v
-
-
-def remap_single_transformer_blocks_(key, state_dict):
-    hidden_size = 3072
-
-    if "linear1.weight" in key:
-        linear1_weight = state_dict.pop(key)
-        split_size = (hidden_size, hidden_size, hidden_size, linear1_weight.size(0) - 3 * hidden_size)
-        q, k, v, mlp = torch.split(linear1_weight, split_size, dim=0)
-        new_key = key.replace("single_blocks", "single_transformer_blocks").removesuffix(".linear1.weight")
-        state_dict[f"{new_key}.attn.to_q.weight"] = q
-        state_dict[f"{new_key}.attn.to_k.weight"] = k
-        state_dict[f"{new_key}.attn.to_v.weight"] = v
-        state_dict[f"{new_key}.proj_mlp.weight"] = mlp
-
-    elif "linear1.bias" in key:
-        linear1_bias = state_dict.pop(key)
-        split_size = (hidden_size, hidden_size, hidden_size, linear1_bias.size(0) - 3 * hidden_size)
-        q_bias, k_bias, v_bias, mlp_bias = torch.split(linear1_bias, split_size, dim=0)
-        new_key = key.replace("single_blocks", "single_transformer_blocks").removesuffix(".linear1.bias")
-        state_dict[f"{new_key}.attn.to_q.bias"] = q_bias
-        state_dict[f"{new_key}.attn.to_k.bias"] = k_bias
-        state_dict[f"{new_key}.attn.to_v.bias"] = v_bias
-        state_dict[f"{new_key}.proj_mlp.bias"] = mlp_bias
-
-    else:
-        new_key = key.replace("single_blocks", "single_transformer_blocks")
-        new_key = new_key.replace("linear2", "proj_out")
-        new_key = new_key.replace("q_norm", "attn.norm_q")
-        new_key = new_key.replace("k_norm", "attn.norm_k")
-        state_dict[new_key] = state_dict.pop(key)
-
-
-TRANSFORMER_KEYS_RENAME_DICT = {
-    "img_in": "x_embedder",
-    "time_in.mlp.0": "time_text_embed.timestep_embedder.linear_1",
-    "time_in.mlp.2": "time_text_embed.timestep_embedder.linear_2",
-    "guidance_in.mlp.0": "time_text_embed.guidance_embedder.linear_1",
-    "guidance_in.mlp.2": "time_text_embed.guidance_embedder.linear_2",
-    "vector_in.in_layer": "time_text_embed.text_embedder.linear_1",
-    "vector_in.out_layer": "time_text_embed.text_embedder.linear_2",
-    "double_blocks": "transformer_blocks",
-    "img_attn_q_norm": "attn.norm_q",
-    "img_attn_k_norm": "attn.norm_k",
-    "img_attn_proj": "attn.to_out.0",
-    "txt_attn_q_norm": "attn.norm_added_q",
-    "txt_attn_k_norm": "attn.norm_added_k",
-    "txt_attn_proj": "attn.to_add_out",
-    "img_mod.linear": "norm1.linear",
-    "img_norm1": "norm1.norm",
-    "img_norm2": "norm2",
-    "img_mlp": "ff",
-    "txt_mod.linear": "norm1_context.linear",
-    "txt_norm1": "norm1.norm",
-    "txt_norm2": "norm2_context",
-    "txt_mlp": "ff_context",
-    "self_attn_proj": "attn.to_out.0",
-    "modulation.linear": "norm.linear",
-    "pre_norm": "norm.norm",
-    "final_layer.norm_final": "norm_out.norm",
-    "final_layer.linear": "proj_out",
-    "fc1": "net.0.proj",
-    "fc2": "net.2",
-    "input_embedder": "proj_in",
-}
-
-TRANSFORMER_SPECIAL_KEYS_REMAP = {
-    "txt_in": remap_txt_in_,
-    "img_attn_qkv": remap_img_attn_qkv_,
-    "txt_attn_qkv": remap_txt_attn_qkv_,
-    "single_blocks": remap_single_transformer_blocks_,
-    "final_layer.adaLN_modulation.1": remap_norm_scale_shift_,
-}
-
-VAE_KEYS_RENAME_DICT = {}
-
-VAE_SPECIAL_KEYS_REMAP = {}
-
-
-def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
-    state_dict[new_key] = state_dict.pop(old_key)
-
-
-def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
-    state_dict = saved_dict
-    if "model" in saved_dict.keys():
-        state_dict = state_dict["model"]
-    if "module" in saved_dict.keys():
-        state_dict = state_dict["module"]
-    if "state_dict" in saved_dict.keys():
-        state_dict = state_dict["state_dict"]
-    return state_dict
-
-
-def convert_transformer(ckpt_path: str):
-    original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", weights_only=True))
-
-    with init_empty_weights():
-        transformer = HunyuanVideoTransformer3DModel()
-
-    for key in list(original_state_dict.keys()):
-        new_key = key[:]
-        for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
-            new_key = new_key.replace(replace_key, rename_key)
-        update_state_dict_(original_state_dict, key, new_key)
-
-    for key in list(original_state_dict.keys()):
-        for special_key, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
-            if special_key not in key:
-                continue
-            handler_fn_inplace(key, original_state_dict)
-
-    transformer.load_state_dict(original_state_dict, strict=True, assign=True)
-    return transformer
-
-
-def convert_vae(ckpt_path: str):
-    original_state_dict = get_state_dict(torch.load(ckpt_path, map_location="cpu", weights_only=True))
-
-    with init_empty_weights():
-        vae = AutoencoderKLHunyuanVideo()
-
-    for key in list(original_state_dict.keys()):
-        new_key = key[:]
-        for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
-            new_key = new_key.replace(replace_key, rename_key)
-        update_state_dict_(original_state_dict, key, new_key)
-
-    for key in list(original_state_dict.keys()):
-        for special_key, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items():
-            if special_key not in key:
-                continue
-            handler_fn_inplace(key, original_state_dict)
-
-    vae.load_state_dict(original_state_dict, strict=True, assign=True)
-    return vae
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--transformer_ckpt_path", type=str, default=None, help="Path to original transformer checkpoint"
-    )
-    parser.add_argument("--vae_ckpt_path", type=str, default=None, help="Path to original VAE checkpoint")
-    parser.add_argument("--text_encoder_path", type=str, default=None, help="Path to original llama checkpoint")
-    parser.add_argument("--tokenizer_path", type=str, default=None, help="Path to original llama tokenizer")
-    parser.add_argument("--text_encoder_2_path", type=str, default=None, help="Path to original clip checkpoint")
-    parser.add_argument("--save_pipeline", action="store_true")
-    parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
-    parser.add_argument("--dtype", default="bf16", help="Torch dtype to save the transformer in.")
-    return parser.parse_args()
-
-
-DTYPE_MAPPING = {
-    "fp32": torch.float32,
-    "fp16": torch.float16,
-    "bf16": torch.bfloat16,
-}
-
-
-if __name__ == "__main__":
-    args = get_args()
-
-    transformer = None
-    dtype = DTYPE_MAPPING[args.dtype]
-
-    if args.save_pipeline:
-        assert args.transformer_ckpt_path is not None and args.vae_ckpt_path is not None
-        assert args.text_encoder_path is not None
-        assert args.tokenizer_path is not None
-        assert args.text_encoder_2_path is not None
-
-    if args.transformer_ckpt_path is not None:
-        transformer = convert_transformer(args.transformer_ckpt_path)
-        transformer = transformer.to(dtype=dtype)
-        if not args.save_pipeline:
-            transformer.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
-
-    if args.vae_ckpt_path is not None:
-        vae = convert_vae(args.vae_ckpt_path)
-        if not args.save_pipeline:
-            vae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
-
-    if args.save_pipeline:
-        text_encoder = AutoModel.from_pretrained(args.text_encoder_path, torch_dtype=torch.float16)
-        tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path, padding_side="right")
-        text_encoder_2 = CLIPTextModel.from_pretrained(args.text_encoder_2_path, torch_dtype=torch.float16)
-        tokenizer_2 = CLIPTokenizer.from_pretrained(args.text_encoder_2_path)
-        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
-
-        pipe = HunyuanVideoPipeline(
-            transformer=transformer,
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            text_encoder_2=text_encoder_2,
-            tokenizer_2=tokenizer_2,
-            scheduler=scheduler,
-        )
-        pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
@@ -1,297 +0,0 @@
-import argparse
-from pathlib import Path
-from typing import Any, Dict
-
-import torch
-from accelerate import init_empty_weights
-from safetensors.torch import load_file
-from transformers import T5EncoderModel, T5Tokenizer
-
-from diffusers import AutoencoderKLLTXVideo, FlowMatchEulerDiscreteScheduler, LTXPipeline, LTXVideoTransformer3DModel
-
-
-def remove_keys_(key: str, state_dict: Dict[str, Any]):
-    state_dict.pop(key)
-
-
-TOKENIZER_MAX_LENGTH = 128
-
-TRANSFORMER_KEYS_RENAME_DICT = {
-    "patchify_proj": "proj_in",
-    "adaln_single": "time_embed",
-    "q_norm": "norm_q",
-    "k_norm": "norm_k",
-}
-
-TRANSFORMER_SPECIAL_KEYS_REMAP = {
-    "vae": remove_keys_,
-}
-
-VAE_KEYS_RENAME_DICT = {
-    # decoder
-    "up_blocks.0": "mid_block",
-    "up_blocks.1": "up_blocks.0",
-    "up_blocks.2": "up_blocks.1.upsamplers.0",
-    "up_blocks.3": "up_blocks.1",
-    "up_blocks.4": "up_blocks.2.conv_in",
-    "up_blocks.5": "up_blocks.2.upsamplers.0",
-    "up_blocks.6": "up_blocks.2",
-    "up_blocks.7": "up_blocks.3.conv_in",
-    "up_blocks.8": "up_blocks.3.upsamplers.0",
-    "up_blocks.9": "up_blocks.3",
-    # encoder
-    "down_blocks.0": "down_blocks.0",
-    "down_blocks.1": "down_blocks.0.downsamplers.0",
-    "down_blocks.2": "down_blocks.0.conv_out",
-    "down_blocks.3": "down_blocks.1",
-    "down_blocks.4": "down_blocks.1.downsamplers.0",
-    "down_blocks.5": "down_blocks.1.conv_out",
-    "down_blocks.6": "down_blocks.2",
-    "down_blocks.7": "down_blocks.2.downsamplers.0",
-    "down_blocks.8": "down_blocks.3",
-    "down_blocks.9": "mid_block",
-    # common
-    "conv_shortcut": "conv_shortcut.conv",
-    "res_blocks": "resnets",
-    "norm3.norm": "norm3",
-    "per_channel_statistics.mean-of-means": "latents_mean",
-    "per_channel_statistics.std-of-means": "latents_std",
-}
-
-VAE_091_RENAME_DICT = {
-    # decoder
-    "up_blocks.0": "mid_block",
-    "up_blocks.1": "up_blocks.0.upsamplers.0",
-    "up_blocks.2": "up_blocks.0",
-    "up_blocks.3": "up_blocks.1.upsamplers.0",
-    "up_blocks.4": "up_blocks.1",
-    "up_blocks.5": "up_blocks.2.upsamplers.0",
-    "up_blocks.6": "up_blocks.2",
-    "up_blocks.7": "up_blocks.3.upsamplers.0",
-    "up_blocks.8": "up_blocks.3",
-    # common
-    "last_time_embedder": "time_embedder",
-    "last_scale_shift_table": "scale_shift_table",
-}
-
-VAE_SPECIAL_KEYS_REMAP = {
-    "per_channel_statistics.channel": remove_keys_,
-    "per_channel_statistics.mean-of-means": remove_keys_,
-    "per_channel_statistics.mean-of-stds": remove_keys_,
-    "model.diffusion_model": remove_keys_,
-}
-
-VAE_091_SPECIAL_KEYS_REMAP = {
-    "timestep_scale_multiplier": remove_keys_,
-}
-
-
-def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
-    state_dict = saved_dict
-    if "model" in saved_dict.keys():
-        state_dict = state_dict["model"]
-    if "module" in saved_dict.keys():
-        state_dict = state_dict["module"]
-    if "state_dict" in saved_dict.keys():
-        state_dict = state_dict["state_dict"]
-    return state_dict
-
-
-def update_state_dict_inplace(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
-    state_dict[new_key] = state_dict.pop(old_key)
-
-
-def convert_transformer(
-    ckpt_path: str,
-    dtype: torch.dtype,
-):
-    PREFIX_KEY = "model.diffusion_model."
-
-    original_state_dict = get_state_dict(load_file(ckpt_path))
-    with init_empty_weights():
-        transformer = LTXVideoTransformer3DModel()
-
-    for key in list(original_state_dict.keys()):
-        new_key = key[:]
-        if new_key.startswith(PREFIX_KEY):
-            new_key = key[len(PREFIX_KEY) :]
-        for replace_key, rename_key in TRANSFORMER_KEYS_RENAME_DICT.items():
-            new_key = new_key.replace(replace_key, rename_key)
-        update_state_dict_inplace(original_state_dict, key, new_key)
-
-    for key in list(original_state_dict.keys()):
-        for special_key, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
-            if special_key not in key:
-                continue
-            handler_fn_inplace(key, original_state_dict)
-
-    transformer.load_state_dict(original_state_dict, strict=True, assign=True)
-    return transformer
-
-
-def convert_vae(ckpt_path: str, config, dtype: torch.dtype):
-    PREFIX_KEY = "vae."
-
-    original_state_dict = get_state_dict(load_file(ckpt_path))
-    with init_empty_weights():
-        vae = AutoencoderKLLTXVideo(**config)
-
-    for key in list(original_state_dict.keys()):
-        new_key = key[:]
-        if new_key.startswith(PREFIX_KEY):
-            new_key = key[len(PREFIX_KEY) :]
-        for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
-            new_key = new_key.replace(replace_key, rename_key)
-        update_state_dict_inplace(original_state_dict, key, new_key)
-
-    for key in list(original_state_dict.keys()):
-        for special_key, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items():
-            if special_key not in key:
-                continue
-            handler_fn_inplace(key, original_state_dict)
-
-    vae.load_state_dict(original_state_dict, strict=True, assign=True)
-    return vae
-
-
-def get_vae_config(version: str) -> Dict[str, Any]:
-    if version == "0.9.0":
-        config = {
-            "in_channels": 3,
-            "out_channels": 3,
-            "latent_channels": 128,
-            "block_out_channels": (128, 256, 512, 512),
-            "decoder_block_out_channels": (128, 256, 512, 512),
-            "layers_per_block": (4, 3, 3, 3, 4),
-            "decoder_layers_per_block": (4, 3, 3, 3, 4),
-            "spatio_temporal_scaling": (True, True, True, False),
-            "decoder_spatio_temporal_scaling": (True, True, True, False),
-            "decoder_inject_noise": (False, False, False, False, False),
-            "upsample_residual": (False, False, False, False),
-            "upsample_factor": (1, 1, 1, 1),
-            "patch_size": 4,
-            "patch_size_t": 1,
-            "resnet_norm_eps": 1e-6,
-            "scaling_factor": 1.0,
-            "encoder_causal": True,
-            "decoder_causal": False,
-            "timestep_conditioning": False,
-        }
-    elif version == "0.9.1":
-        config = {
-            "in_channels": 3,
-            "out_channels": 3,
-            "latent_channels": 128,
-            "block_out_channels": (128, 256, 512, 512),
-            "decoder_block_out_channels": (256, 512, 1024),
-            "layers_per_block": (4, 3, 3, 3, 4),
-            "decoder_layers_per_block": (5, 6, 7, 8),
-            "spatio_temporal_scaling": (True, True, True, False),
-            "decoder_spatio_temporal_scaling": (True, True, True),
-            "decoder_inject_noise": (True, True, True, False),
-            "upsample_residual": (True, True, True),
-            "upsample_factor": (2, 2, 2),
-            "timestep_conditioning": True,
-            "patch_size": 4,
-            "patch_size_t": 1,
-            "resnet_norm_eps": 1e-6,
-            "scaling_factor": 1.0,
-            "encoder_causal": True,
-            "decoder_causal": False,
-        }
-        VAE_KEYS_RENAME_DICT.update(VAE_091_RENAME_DICT)
-        VAE_SPECIAL_KEYS_REMAP.update(VAE_091_SPECIAL_KEYS_REMAP)
-    return config
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--transformer_ckpt_path", type=str, default=None, help="Path to original transformer checkpoint"
-    )
-    parser.add_argument("--vae_ckpt_path", type=str, default=None, help="Path to original vae checkpoint")
-    parser.add_argument(
-        "--text_encoder_cache_dir", type=str, default=None, help="Path to text encoder cache directory"
-    )
-    parser.add_argument(
-        "--typecast_text_encoder",
-        action="store_true",
-        default=False,
-        help="Whether or not to apply fp16/bf16 precision to text_encoder",
-    )
-    parser.add_argument("--save_pipeline", action="store_true")
-    parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
-    parser.add_argument("--dtype", default="fp32", help="Torch dtype to save the model in.")
-    parser.add_argument(
-        "--version", type=str, default="0.9.0", choices=["0.9.0", "0.9.1"], help="Version of the LTX model"
-    )
-    return parser.parse_args()
-
-
-DTYPE_MAPPING = {
-    "fp32": torch.float32,
-    "fp16": torch.float16,
-    "bf16": torch.bfloat16,
-}
-
-VARIANT_MAPPING = {
-    "fp32": None,
-    "fp16": "fp16",
-    "bf16": "bf16",
-}
-
-
-if __name__ == "__main__":
-    args = get_args()
-
-    transformer = None
-    dtype = DTYPE_MAPPING[args.dtype]
-    variant = VARIANT_MAPPING[args.dtype]
-    output_path = Path(args.output_path)
-
-    if args.save_pipeline:
-        assert args.transformer_ckpt_path is not None and args.vae_ckpt_path is not None
-
-    if args.transformer_ckpt_path is not None:
-        transformer: LTXVideoTransformer3DModel = convert_transformer(args.transformer_ckpt_path, dtype)
-        if not args.save_pipeline:
-            transformer.save_pretrained(
-                output_path / "transformer", safe_serialization=True, max_shard_size="5GB", variant=variant
-            )
-
-    if args.vae_ckpt_path is not None:
-        config = get_vae_config(args.version)
-        vae: AutoencoderKLLTXVideo = convert_vae(args.vae_ckpt_path, config, dtype)
-        if not args.save_pipeline:
-            vae.save_pretrained(output_path / "vae", safe_serialization=True, max_shard_size="5GB", variant=variant)
-
-    if args.save_pipeline:
-        text_encoder_id = "google/t5-v1_1-xxl"
-        tokenizer = T5Tokenizer.from_pretrained(text_encoder_id, model_max_length=TOKENIZER_MAX_LENGTH)
-        text_encoder = T5EncoderModel.from_pretrained(text_encoder_id, cache_dir=args.text_encoder_cache_dir)
-
-        if args.typecast_text_encoder:
-            text_encoder = text_encoder.to(dtype=dtype)
-
-        # Apparently, the conversion does not work anymore without this :shrug:
-        for param in text_encoder.parameters():
-            param.data = param.data.contiguous()
-
-        scheduler = FlowMatchEulerDiscreteScheduler(
-            use_dynamic_shifting=True,
-            base_shift=0.95,
-            max_shift=2.05,
-            base_image_seq_len=1024,
-            max_image_seq_len=4096,
-            shift_terminal=0.1,
-        )
-
-        pipe = LTXPipeline(
-            scheduler=scheduler,
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            transformer=transformer,
-        )
-
-        pipe.save_pretrained(args.output_path, safe_serialization=True, variant=variant, max_shard_size="5GB")
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Marc Sun	1cd5155bb8	remove print	2024-12-04 13:04:48 +00:00
Marc Sun	b14bffeffe	first draft	2024-12-04 13:03:35 +00:00
Marc Sun	e66c4d0dab	Update src/diffusers/pipelines/pipeline_utils.py Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-12-04 13:57:21 +01:00
Marc Sun	7d2c7d5553	Update src/diffusers/pipelines/pipeline_utils.py Co-authored-by: Sayak Paul <spsayakpaul@gmail.com>	2024-12-04 13:56:21 +01:00
Sayak Paul	78135f1478	Merge branch 'main' into dduf	2024-12-04 17:34:58 +05:30
sayakpaul	d8408677c5	updates	2024-12-03 14:06:47 +00:00
Sayak Paul	cbee7cbc6b	Merge branch 'main' into dduf	2024-11-30 08:56:15 +05:30
Marc Sun	2eeda25321	switch to zip uncompressed	2024-11-28 16:06:04 +01:00
Marc Sun	0389333113	style	2024-11-27 18:01:43 +01:00
Marc Sun	1fb86e34c0	load and save dduf archive	2024-11-27 18:01:36 +01:00