start fixing nan

2024-10-18 14:39:08 +05:30
269 changed files with 4277 additions and 23006 deletions
@@ -180,62 +180,6 @@ jobs:
        pip install slack_sdk tabulate
        python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

-  run_big_gpu_torch_tests:
-    name: Torch tests on big GPU
-    strategy:
-      fail-fast: false
-      max-parallel: 2
-    runs-on:
-      group: aws-g6e-xlarge-plus
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "16gb" --ipc host --gpus 0
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-      - name: NVIDIA-SMI
-        run: nvidia-smi
-      - name: Install dependencies
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-          python -m uv pip install peft@git+https://github.com/huggingface/peft.git
-          pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
-          python -m uv pip install pytest-reportlog
-      - name: Environment
-        run: |
-          python utils/print_env.py
-      - name: Selected Torch CUDA Test on big GPU
-        env:
-          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
-          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-          CUBLAS_WORKSPACE_CONFIG: :16:8
-          BIG_GPU_MEMORY: 40
-        run: |
-          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            -m "big_gpu_with_torch_cuda" \
-            --make-reports=tests_big_gpu_torch_cuda \
-            --report-log=tests_big_gpu_torch_cuda.log \
-            tests/
-      - name: Failure short reports
-        if: ${{ failure() }}
-        run: |
-          cat reports/tests_big_gpu_torch_cuda_stats.txt
-          cat reports/tests_big_gpu_torch_cuda_failures_short.txt
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: torch_cuda_big_gpu_test_reports
-          path: reports
-      - name: Generate Report and Notify Channel
-        if: always()
-        run: |
-          pip install slack_sdk tabulate
-          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
-
  run_flax_tpu_tests:
    name: Nightly Flax TPU Tests
    runs-on: docker-tpu
@@ -92,14 +92,12 @@ jobs:
      run: |
        python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
        python -m uv pip install -e [quality,test]
-        # TODO (sayakpaul, DN6): revisit `--no-deps`
        if [ "${{ matrix.lib-versions }}" == "main" ]; then
-            python -m pip install -U peft@git+https://github.com/huggingface/peft.git --no-deps
-            python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git --no-deps
-            pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git --no-deps
+            python -m pip install -U peft@git+https://github.com/huggingface/peft.git
+            python -m uv pip install -U transformers@git+https://github.com/huggingface/transformers.git
+            pip uninstall accelerate -y && python -m uv pip install -U accelerate@git+https://github.com/huggingface/accelerate.git
        else
-            python -m uv pip install -U peft --no-deps
-            python -m uv pip install -U transformers accelerate --no-deps
+            python -m uv pip install -U peft transformers accelerate
        fi

    - name: Environment
@@ -81,7 +81,7 @@ jobs:
      - name: Environment
        run: |
          python utils/print_env.py
-      - name: PyTorch CUDA checkpoint tests on Ubuntu
+      - name: Slow PyTorch CUDA checkpoint tests on Ubuntu
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
@@ -184,7 +184,7 @@ jobs:
      run: |
        python utils/print_env.py

-    - name: Run Flax TPU tests
+    - name: Run slow Flax TPU tests
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
@@ -232,7 +232,7 @@ jobs:
      run: |
        python utils/print_env.py

-    - name: Run ONNXRuntime CUDA tests
+    - name: Run slow ONNXRuntime CUDA tests
      env:
        HF_TOKEN: ${{ secrets.HF_TOKEN }}
      run: |
@@ -4,13 +4,12 @@ on:
  workflow_dispatch:
    inputs:
      runner_type:
-        description: 'Type of runner to test (aws-g6-4xlarge-plus: a10, aws-g4dn-2xlarge: t4, aws-g6e-xlarge-plus: L40)'
+        description: 'Type of runner to test (aws-g6-4xlarge-plus: a10 or aws-g4dn-2xlarge: t4)'
        type: choice
        required: true
        options:
          - aws-g6-4xlarge-plus
          - aws-g4dn-2xlarge
-          - aws-g6e-xlarge-plus
      docker_image:
        description: 'Name of the Docker image'
        required: true
@@ -28,7 +28,7 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    python3.10 -m uv pip install --no-cache-dir \
-        "torch<2.5.0" \
+        torch \
        torchvision \
        torchaudio \
        "onnxruntime-gpu>=1.13.1" \
@@ -29,7 +29,7 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    python3.10 -m uv pip install --no-cache-dir \
-    "torch<2.5.0" \
+    torch \
    torchvision \
    torchaudio \
    invisible_watermark && \
@@ -29,7 +29,7 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    python3.10 -m uv pip install --no-cache-dir \
-        "torch<2.5.0" \
+        torch \
        torchvision \
        torchaudio \
        invisible_watermark \
@@ -29,7 +29,7 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    python3.10 -m uv pip install --no-cache-dir \
-    "torch<2.5.0" \
+    torch \
    torchvision \
    torchaudio \
    invisible_watermark && \
@@ -29,7 +29,7 @@ ENV PATH="/opt/venv/bin:$PATH"
 # pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
 RUN python3.10 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
    python3.10 -m pip install --no-cache-dir \
-        "torch<2.5.0" \
+        torch \
        torchvision \
        torchaudio \
        invisible_watermark && \
@@ -150,12 +150,6 @@
      title: Reinforcement learning training with DDPO
    title: Methods
  title: Training
- sections:
-  - local: quantization/overview
-    title: Getting Started
-  - local: quantization/bitsandbytes
-    title: bitsandbytes
-  title: Quantization Methods
 - sections:
  - local: optimization/fp16
    title: Speed up inference
@@ -188,8 +182,6 @@
      title: Metal Performance Shaders (MPS)
    - local: optimization/habana
      title: Habana Gaudi
-    - local: optimization/neuron
-      title: AWS Neuron
    title: Optimized hardware
  title: Accelerate inference and reduce memory
 - sections:
@@ -217,8 +209,6 @@
      title: Logging
    - local: api/outputs
      title: Outputs
-    - local: api/quantization
-      title: Quantization
    title: Main Classes
  - isExpanded: false
    sections:
@@ -252,8 +242,6 @@
        title: SparseControlNetModel
      title: ControlNets
    - sections:
-      - local: api/models/allegro_transformer3d
-        title: AllegroTransformer3DModel
      - local: api/models/aura_flow_transformer2d
        title: AuraFlowTransformer2DModel
      - local: api/models/cogvideox_transformer3d
@@ -270,8 +258,6 @@
        title: LatteTransformer3DModel
      - local: api/models/lumina_nextdit2d
        title: LuminaNextDiT2DModel
-      - local: api/models/mochi_transformer3d
-        title: MochiTransformer3DModel
      - local: api/models/pixart_transformer2d
        title: PixArtTransformer2DModel
      - local: api/models/prior_transformer
@@ -304,12 +290,8 @@
    - sections:
      - local: api/models/autoencoderkl
        title: AutoencoderKL
-      - local: api/models/autoencoderkl_allegro
-        title: AutoencoderKLAllegro
      - local: api/models/autoencoderkl_cogvideox
        title: AutoencoderKLCogVideoX
-      - local: api/models/autoencoderkl_mochi
-        title: AutoencoderKLMochi
      - local: api/models/asymmetricautoencoderkl
        title: AsymmetricAutoencoderKL
      - local: api/models/consistency_decoder_vae
@@ -326,8 +308,6 @@
    sections:
    - local: api/pipelines/overview
      title: Overview
-    - local: api/pipelines/allegro
-      title: Allegro
    - local: api/pipelines/amused
      title: aMUSEd
    - local: api/pipelines/animatediff
@@ -404,8 +384,6 @@
      title: Lumina-T2X
    - local: api/pipelines/marigold
      title: Marigold
-    - local: api/pipelines/mochi
-      title: Mochi
    - local: api/pipelines/panorama
      title: MultiDiffusion
    - local: api/pipelines/musicldm
@@ -1,30 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# AllegroTransformer3DModel
-
-A Diffusion Transformer model for 3D data from [Allegro](https://github.com/rhymes-ai/Allegro) was introduced in [Allegro: Open the Black Box of Commercial-Level Video Generation Model](https://huggingface.co/papers/2410.15458) by RhymesAI.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import AllegroTransformer3DModel
-
-transformer = AllegroTransformer3DModel.from_pretrained("rhymes-ai/Allegro", subfolder="transformer", torch_dtype=torch.bfloat16).to("cuda")
-```
-
-## AllegroTransformer3DModel
-
-[[autodoc]] AllegroTransformer3DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
@@ -1,37 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# AutoencoderKLAllegro
-
-The 3D variational autoencoder (VAE) model with KL loss used in [Allegro](https://github.com/rhymes-ai/Allegro) was introduced in [Allegro: Open the Black Box of Commercial-Level Video Generation Model](https://huggingface.co/papers/2410.15458) by RhymesAI.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import AutoencoderKLAllegro
-
-vae = AutoencoderKLAllegro.from_pretrained("rhymes-ai/Allegro", subfolder="vae", torch_dtype=torch.float32).to("cuda")
-```
-
-## AutoencoderKLAllegro
-
-[[autodoc]] AutoencoderKLAllegro
-    - decode
-    - encode
-    - all
-
-## AutoencoderKLOutput
-
-[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
-
-## DecoderOutput
-
-[[autodoc]] models.autoencoders.vae.DecoderOutput
@@ -1,32 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# AutoencoderKLMochi
-
-The 3D variational autoencoder (VAE) model with KL loss used in [Mochi](https://github.com/genmoai/models) was introduced in [Mochi 1 Preview](https://huggingface.co/genmo/mochi-1-preview) by Genmo.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import AutoencoderKLMochi
-
-vae = AutoencoderKLMochi.from_pretrained("genmo/mochi-1-preview", subfolder="vae", torch_dtype=torch.float32).to("cuda")
-```
-
-## AutoencoderKLMochi
-
-[[autodoc]] AutoencoderKLMochi
-    - decode
-    - all
-
-## DecoderOutput
-
-[[autodoc]] models.autoencoders.vae.DecoderOutput
@@ -39,7 +39,7 @@ pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=contro

 ## ControlNetOutput

-[[autodoc]] models.controlnets.controlnet.ControlNetOutput
+[[autodoc]] models.controlnet.ControlNetOutput

 ## FlaxControlNetModel

@@ -47,4 +47,4 @@ pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=contro

 ## FlaxControlNetOutput

-[[autodoc]] models.controlnets.controlnet_flax.FlaxControlNetOutput
+[[autodoc]] models.controlnet_flax.FlaxControlNetOutput
@@ -38,5 +38,5 @@ pipe = StableDiffusion3ControlNetPipeline.from_pretrained("stabilityai/stable-di

 ## SD3ControlNetOutput

-[[autodoc]] models.controlnets.controlnet_sd3.SD3ControlNetOutput
+[[autodoc]] models.controlnet_sd3.SD3ControlNetOutput

@@ -1,30 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# MochiTransformer3DModel
-
-A Diffusion Transformer model for 3D video-like data was introduced in [Mochi-1 Preview](https://huggingface.co/genmo/mochi-1-preview) by Genmo.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import MochiTransformer3DModel
-
-transformer = MochiTransformer3DModel.from_pretrained("genmo/mochi-1-preview", subfolder="transformer", torch_dtype=torch.float16).to("cuda")
-```
-
-## MochiTransformer3DModel
-
-[[autodoc]] MochiTransformer3DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
@@ -1,34 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# Allegro
-
-[Allegro: Open the Black Box of Commercial-Level Video Generation Model](https://huggingface.co/papers/2410.15458) from RhymesAI, by Yuan Zhou, Qiuyue Wang, Yuxuan Cai, Huan Yang.
-
-The abstract from the paper is:
-
-*Significant advancements have been made in the field of video generation, with the open-source community contributing a wealth of research papers and tools for training high-quality models. However, despite these efforts, the available information and resources remain insufficient for achieving commercial-level performance. In this report, we open the black box and introduce Allegro, an advanced video generation model that excels in both quality and temporal consistency. We also highlight the current limitations in the field and present a comprehensive methodology for training high-performance, commercial-level video generation models, addressing key aspects such as data, model architecture, training pipeline, and evaluation. Our user study shows that Allegro surpasses existing open-source models and most commercial models, ranking just behind Hailuo and Kling. Code: https://github.com/rhymes-ai/Allegro , Model: https://huggingface.co/rhymes-ai/Allegro , Gallery: https://rhymes.ai/allegro_gallery .*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## AllegroPipeline
-
-[[autodoc]] AllegroPipeline
-  - all
-  - __call__
-
-## AllegroPipelineOutput
-
-[[autodoc]] pipelines.allegro.pipeline_output.AllegroPipelineOutput
@@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team, The InstantX Team, and the XLabs Team. All rights reserved.
+<!--Copyright 2024 The HuggingFace Team and The InstantX Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@@ -31,14 +31,6 @@ This controlnet code is implemented by [The InstantX Team](https://huggingface.c
 | Depth | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/Shakker-Labs/FLUX.1-dev-ControlNet-Depth) |
 | Union | [The InstantX Team](https://huggingface.co/InstantX) | [Link](https://huggingface.co/InstantX/FLUX.1-dev-Controlnet-Union) |

-XLabs ControlNets, which were contributed by the [XLabs team](https://huggingface.co/XLabs-AI), are also supported.
-
-| ControlNet type | Developer | Link |
-| -------- | ---------- | ---- |
-| Canny | [The XLabs Team](https://huggingface.co/XLabs-AI) | [Link](https://huggingface.co/XLabs-AI/flux-controlnet-canny-diffusers) |
-| Depth | [The XLabs Team](https://huggingface.co/XLabs-AI) | [Link](https://huggingface.co/XLabs-AI/flux-controlnet-depth-diffusers) |
-| HED | [The XLabs Team](https://huggingface.co/XLabs-AI) | [Link](https://huggingface.co/XLabs-AI/flux-controlnet-hed-diffusers) |
-

 <Tip>

@@ -1,36 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-->
-
-# Mochi
-
-[Mochi 1 Preview](https://huggingface.co/genmo/mochi-1-preview) from Genmo.
-
-*Mochi 1 preview is an open state-of-the-art video generation model with high-fidelity motion and strong prompt adherence in preliminary evaluation. This model dramatically closes the gap between closed and open video generation systems. The model is released under a permissive Apache 2.0 license.*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers.md) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading.md#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## MochiPipeline
-
-[[autodoc]] MochiPipeline
-  - all
-  - __call__
-
-## MochiPipelineOutput
-
-[[autodoc]] pipelines.mochi.pipeline_output.MochiPipelineOutput
@@ -54,11 +54,6 @@ image = pipe(
 image.save("sd3_hello_world.png")
 ```

-**Note:** Stable Diffusion 3.5 can also be run using the SD3 pipeline, and all mentioned optimizations and techniques apply to it as well. In total there are three official models in the SD3 family:
- [`stabilityai/stable-diffusion-3-medium-diffusers`](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers)
- [`stabilityai/stable-diffusion-3.5-large`](https://huggingface.co/stabilityai/stable-diffusion-3.5-large)
- [`stabilityai/stable-diffusion-3.5-large-turbo`](https://huggingface.co/stabilityai/stable-diffusion-3.5-large-turbo)
-
 ## Memory Optimisations for SD3

 SD3 uses three text encoders, one of which is the very large T5-XXL model. This makes it challenging to run the model on GPUs with less than 24GB of VRAM, even when using `fp16` precision. The following section outlines a few memory optimizations in Diffusers that make it easier to run SD3 on low resource hardware.
@@ -313,26 +308,6 @@ image = pipe("a picture of a cat holding a sign that says hello world").images[0
 image.save('sd3-single-file-t5-fp8.png')
 ```

-### Loading the single file checkpoint for the Stable Diffusion 3.5 Transformer Model
-
-```python
-import torch
-from diffusers import SD3Transformer2DModel, StableDiffusion3Pipeline
-
-transformer = SD3Transformer2DModel.from_single_file(
-    "https://huggingface.co/stabilityai/stable-diffusion-3.5-large-turbo/blob/main/sd3.5_large.safetensors",
-    torch_dtype=torch.bfloat16,
-)
-pipe = StableDiffusion3Pipeline.from_pretrained(
-    "stabilityai/stable-diffusion-3.5-large",
-    transformer=transformer,
-    torch_dtype=torch.bfloat16,
-)
-pipe.enable_model_cpu_offload()
-image = pipe("a cat holding a sign that says hello world").images[0]
-image.save("sd35.png")
-```
-
 ## StableDiffusion3Pipeline

 [[autodoc]] StableDiffusion3Pipeline
@@ -1,33 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-->
-
-# Quantization
-
-Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference. Diffusers supports 8-bit and 4-bit quantization with [bitsandbytes](https://huggingface.co/docs/bitsandbytes/en/index).
-
-Quantization techniques that aren't supported in Transformers can be added with the [`DiffusersQuantizer`] class.
-
-<Tip>
-
-Learn how to quantize models in the [Quantization](../quantization/overview) guide.
-
-</Tip>
-
-
-## BitsAndBytesConfig
-
-[[autodoc]] BitsAndBytesConfig
-
-## DiffusersQuantizer
-
-[[autodoc]] quantizers.base.DiffusersQuantizer
@@ -1,61 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# AWS Neuron
-
-Diffusers functionalities are available on [AWS Inf2 instances](https://aws.amazon.com/ec2/instance-types/inf2/), which are EC2 instances powered by [Neuron machine learning accelerators](https://aws.amazon.com/machine-learning/inferentia/). These instances aim to provide better compute performance (higher throughput, lower latency) with good cost-efficiency, making them good candidates for AWS users to deploy diffusion models to production.
-
-[Optimum Neuron](https://huggingface.co/docs/optimum-neuron/en/index) is the interface between Hugging Face libraries and AWS Accelerators, including AWS [Trainium](https://aws.amazon.com/machine-learning/trainium/) and AWS [Inferentia](https://aws.amazon.com/machine-learning/inferentia/). It supports many of the features in Diffusers with similar APIs, so it is easier to learn if you're already familiar with Diffusers. Once you have created an AWS Inf2 instance, install Optimum Neuron.
-
-```bash
-python -m pip install --upgrade-strategy eager optimum[neuronx]
-```
-
-<Tip>
-
-We provide a pre-built [Hugging Face Neuron Deep Learning AMI](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) (DLAMI) and Optimum Neuron containers for Amazon SageMaker. Using one of them is the recommended way to set up your environment correctly.
-
-</Tip>
-
-The example below demonstrates how to generate images with the Stable Diffusion XL model on an inf2.8xlarge instance (you can switch to cheaper inf2.xlarge instances once the model is compiled). To generate some images, use the [`~optimum.neuron.NeuronStableDiffusionXLPipeline`] class, which is similar to the [`StableDiffusionXLPipeline`] class in Diffusers.
-
-Unlike Diffusers, you need to compile models in the pipeline to the Neuron format, `.neuron`. Launch the following command to export the model to the `.neuron` format.
-
-```bash
-optimum-cli export neuron --model stabilityai/stable-diffusion-xl-base-1.0 \
-  --batch_size 1 \
-  --height 1024 `# height in pixels of generated image, eg. 768, 1024` \
-  --width 1024 `# width in pixels of generated image, eg. 768, 1024` \
-  --num_images_per_prompt 1 `# number of images to generate per prompt, defaults to 1` \
-  --auto_cast matmul `# cast only matrix multiplication operations` \
-  --auto_cast_type bf16 `# cast operations from FP32 to BF16` \
-  sd_neuron_xl/
-```
-
-Now generate some images with the pre-compiled SDXL model.
-
-```python
->>> from optimum.neuron import NeuronStableDiffusionXLPipeline
-
->>> stable_diffusion_xl = NeuronStableDiffusionXLPipeline.from_pretrained("sd_neuron_xl/")
->>> prompt = "a pig with wings flying in floating US dollar banknotes in the air, skyscrapers behind, warm color palette, muted colors, detailed, 8k"
->>> image = stable_diffusion_xl(prompt).images[0]
-```
-
-<img
-  src="https://huggingface.co/datasets/Jingya/document_images/resolve/main/optimum/neuron/sdxl_pig.png"
-  width="256"
-  height="256"
-  alt="peggy generated by sdxl on inf2"
-/>
-
-Feel free to check out more guides and examples on different use cases from the Optimum Neuron [documentation](https://huggingface.co/docs/optimum-neuron/en/inference_tutorials/stable_diffusion#generate-images-with-stable-diffusion-models-on-aws-inferentia)!
@@ -1,260 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-->
-
-# bitsandbytes
-
-[bitsandbytes](https://huggingface.co/docs/bitsandbytes/index) is the easiest option for quantizing a model to 8 and 4-bit. 8-bit quantization multiplies outliers in fp16 with non-outliers in int8, converts the non-outlier values back to fp16, and then adds them together to return the weights in fp16. This reduces the degradative effect outlier values have on a model's performance.
-
-4-bit quantization compresses a model even further, and it is commonly used with [QLoRA](https://hf.co/papers/2305.14314) to finetune quantized LLMs.
-
-
-To use bitsandbytes, make sure you have the following libraries installed:
-
-```bash
-pip install diffusers transformers accelerate bitsandbytes -U
-```
-
-Now you can quantize a model by passing a [`BitsAndBytesConfig`] to [`~ModelMixin.from_pretrained`]. This works for any model in any modality, as long as it supports loading with [Accelerate](https://hf.co/docs/accelerate/index) and contains `torch.nn.Linear` layers.
-
-<hfoptions id="bnb">
-<hfoption id="8-bit">
-
-Quantizing a model in 8-bit halves the memory-usage:
-
-```py
-from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-
-model_8bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev", 
-    subfolder="transformer",
-    quantization_config=quantization_config
-)
-```
-
-By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:
-
-```py
-from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_8bit=True)
-
-model_8bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev", 
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=torch.float32
-)
-model_8bit.transformer_blocks.layers[-1].norm2.weight.dtype
-```
-
-Once a model is quantized, you can push the model to the Hub with the [`~ModelMixin.push_to_hub`] method. The quantization `config.json` file is pushed first, followed by the quantized model weights. You can also save the serialized 4-bit models locally with [`~ModelMixin.save_pretrained`].
-
-</hfoption>
-<hfoption id="4-bit">
-
-Quantizing a model in 4-bit reduces your memory-usage by 4x:
-
-```py
-from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-
-model_4bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev", 
-    subfolder="transformer",
-    quantization_config=quantization_config
-)
-```
-
-By default, all the other modules such as `torch.nn.LayerNorm` are converted to `torch.float16`. You can change the data type of these modules with the `torch_dtype` parameter if you want:
-
-```py
-from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-
-model_4bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev", 
-    subfolder="transformer",
-    quantization_config=quantization_config,
-    torch_dtype=torch.float32
-)
-model_4bit.transformer_blocks.layers[-1].norm2.weight.dtype
-```
-
-Call [`~ModelMixin.push_to_hub`] after loading it in 4-bit precision. You can also save the serialized 4-bit models locally with [`~ModelMixin.save_pretrained`].  
-
-</hfoption>
-</hfoptions>
-
-<Tip warning={true}>
-
-Training with 8-bit and 4-bit weights are only supported for training *extra* parameters.
-
-</Tip>
-
-Check your memory footprint with the `get_memory_footprint` method:
-
-```py
-print(model.get_memory_footprint())
-```
-
-Quantized models can be loaded from the [`~ModelMixin.from_pretrained`] method without needing to specify the `quantization_config` parameters:
-
-```py
-from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_4bit=True)
-
-model_4bit = FluxTransformer2DModel.from_pretrained(
-    "hf-internal-testing/flux.1-dev-nf4-pkg", subfolder="transformer"
-)
-```
-
-## 8-bit (LLM.int8() algorithm)
-
-<Tip>
-
-Learn more about the details of 8-bit quantization in this [blog post](https://huggingface.co/blog/hf-bitsandbytes-integration)!
-
-</Tip>
-
-This section explores some of the specific features of 8-bit models, such as outlier thresholds and skipping module conversion.
-
-### Outlier threshold
-
-An "outlier" is a hidden state value greater than a certain threshold, and these values are computed in fp16. While the values are usually normally distributed ([-3.5, 3.5]), this distribution can be very different for large models ([-60, 6] or [6, 60]). 8-bit quantization works well for values ~5, but beyond that, there is a significant performance penalty. A good default threshold value is 6, but a lower threshold may be needed for more unstable models (small models or finetuning).
-
-To find the best threshold for your model, we recommend experimenting with the `llm_int8_threshold` parameter in [`BitsAndBytesConfig`]:
-
-```py
-from diffusers import FluxTransformer2DModel, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(
-    load_in_8bit=True, llm_int8_threshold=10,
-)
-
-model_8bit = FluxTransformer2DModel.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    subfolder="transformer",
-    quantization_config=quantization_config,
-)
-```
-
-### Skip module conversion
-
-For some models, you don't need to quantize every module to 8-bit which can actually cause instability. For example, for diffusion models like [Stable Diffusion 3](../api/pipelines/stable_diffusion/stable_diffusion_3), the `proj_out` module can be skipped using the `llm_int8_skip_modules` parameter in [`BitsAndBytesConfig`]:
-
-```py
-from diffusers import SD3Transformer2DModel, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(
-    load_in_8bit=True, llm_int8_skip_modules=["proj_out"],
-)
-
-model_8bit = SD3Transformer2DModel.from_pretrained(
-    "stabilityai/stable-diffusion-3-medium-diffusers",
-    subfolder="transformer",
-    quantization_config=quantization_config,
-)
-```
-
-
-## 4-bit (QLoRA algorithm)
-
-<Tip>
-
-Learn more about its details in this [blog post](https://huggingface.co/blog/4bit-transformers-bitsandbytes).
-
-</Tip>
-
-This section explores some of the specific features of 4-bit models, such as changing the compute data type, using the Normal Float 4 (NF4) data type, and using nested quantization.
-
-
-### Compute data type
-
-To speedup computation, you can change the data type from float32 (the default value) to bf16 using the `bnb_4bit_compute_dtype` parameter in [`BitsAndBytesConfig`]:
-
-```py
-import torch
-from diffusers import BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
-```
-
-### Normal Float 4 (NF4)
-
-NF4 is a 4-bit data type from the [QLoRA](https://hf.co/papers/2305.14314) paper, adapted for weights initialized from a normal distribution. You should use NF4 for training 4-bit base models. This can be configured with the `bnb_4bit_quant_type` parameter in the [`BitsAndBytesConfig`]:
-
-```py
-from diffusers import BitsAndBytesConfig
-
-nf4_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-)
-
-model_nf4 = SD3Transformer2DModel.from_pretrained(
-    "stabilityai/stable-diffusion-3-medium-diffusers",
-    subfolder="transformer",
-    quantization_config=nf4_config,
-)
-```
-
-For inference, the `bnb_4bit_quant_type` does not have a huge impact on performance. However, to remain consistent with the model weights, you should use the `bnb_4bit_compute_dtype` and `torch_dtype` values.
-
-### Nested quantization
-
-Nested quantization is a technique that can save additional memory at no additional performance cost. This feature performs a second quantization of the already quantized weights to save an additional 0.4 bits/parameter. 
-
-```py
-from diffusers import BitsAndBytesConfig
-
-double_quant_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-)
-
-double_quant_model = SD3Transformer2DModel.from_pretrained(
-    "stabilityai/stable-diffusion-3-medium-diffusers",
-    subfolder="transformer",
-    quantization_config=double_quant_config,
-)
-```
-
-## Dequantizing `bitsandbytes` models
-
-Once quantized, you can dequantize the model to the original precision but this might result in a small quality loss of the model. Make sure you have enough GPU RAM to fit the dequantized model. 
-
-```python
-from diffusers import BitsAndBytesConfig
-
-double_quant_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_use_double_quant=True,
-)
-
-double_quant_model = SD3Transformer2DModel.from_pretrained(
-    "stabilityai/stable-diffusion-3-medium-diffusers",
-    subfolder="transformer",
-    quantization_config=double_quant_config,
-)
-model.dequantize()
-```
-
-## Resources
-
-* [End-to-end notebook showing Flux.1 Dev inference in a free-tier Colab](https://gist.github.com/sayakpaul/c76bd845b48759e11687ac550b99d8b4)
-* [Training](https://gist.github.com/sayakpaul/05afd428bc089b47af7c016e42004527)
@@ -1,35 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-->
-
-# Quantization
-
-Quantization techniques focus on representing data with less information while also trying to not lose too much accuracy. This often means converting a data type to represent the same information with fewer bits. For example, if your model weights are stored as 32-bit floating points and they're quantized to 16-bit floating points, this halves the model size which makes it easier to store and reduces memory-usage. Lower precision can also speedup inference because it takes less time to perform calculations with fewer bits.
-
-<Tip>
-
-Interested in adding a new quantization method to Transformers? Refer to the [Contribute new quantization method guide](https://huggingface.co/docs/transformers/main/en/quantization/contribute) to learn more about adding a new quantization method.
-
-</Tip>
-
-<Tip>
-
-If you are new to the quantization field, we recommend you to check out these beginner-friendly courses about quantization in collaboration with DeepLearning.AI:
-
-* [Quantization Fundamentals with Hugging Face](https://www.deeplearning.ai/short-courses/quantization-fundamentals-with-hugging-face/)
-* [Quantization in Depth](https://www.deeplearning.ai/short-courses/quantization-in-depth/)
-
-</Tip>
-
-## When to use what?
-
-This section will be expanded once Diffusers has multiple quantization backends. Currently, we only support `bitsandbytes`. [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques. 
@@ -183,7 +183,7 @@ Add the transformer model to the pipeline for denoising, but set the other model

 ```py
 pipeline = FluxPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
+    "black-forest-labs/FLUX.1-dev",
    text_encoder=None,
    text_encoder_2=None,
    tokenizer=None,
@@ -96,7 +96,7 @@ Please keep the following points in mind:
 To activate pivotal tuning for both encoders, add the flag `--enable_t5_ti`. 
 * When not fine-tuning the text encoders, we ALWAYS precompute the text embeddings to save memory.
 * **pure textual inversion** - to support the full range from pivotal tuning to textual inversion we introduce `--train_transformer_frac` which controls the amount of epochs the transformer LoRA layers are trained. By default, `--train_transformer_frac==1`, to trigger a textual inversion run set `--train_transformer_frac==0`. Values between 0 and 1 are supported as well, and we welcome the community to experiment w/ different settings and share the results!
-* **token initializer** - similar to the original textual inversion work, you can specify a concept of your choosing as the starting point for training. By default, when enabling `--train_text_encoder_ti`, the new inserted tokens are initialized randomly. You can specify a token in `--initializer_concept` such that the starting point for the trained embeddings will be the embeddings associated with your chosen `--initializer_concept`.
+* **token initializer** - similar to the original textual inversion work, you can specify a token of your choosing as the starting point for training. By default, when enabling `--train_text_encoder_ti`, the newly inserted tokens are initialized randomly. You can specify a token in `--initializer_token` such that the starting point for the trained embeddings will be the embeddings associated with your chosen `--initializer_token`.

 ## Training examples

@@ -147,6 +147,7 @@ accelerate launch train_dreambooth_lora_flux_advanced.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --dataset_name=$DATASET_NAME \
  --instance_prompt="3d icon in the style of TOK" \
+  --validation_prompt="a TOK icon of an astronaut riding a horse, in the style of TOK" \
  --output_dir=$OUTPUT_DIR \
  --caption_column="prompt" \
  --mixed_precision="bf16" \
@@ -164,7 +165,7 @@ accelerate launch train_dreambooth_lora_flux_advanced.py \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
  --rank=8 \
-  --max_train_steps=700 \
+  --max_train_steps=1000 \
  --checkpointing_steps=2000 \
  --seed="0" \
  --push_to_hub
@@ -189,6 +190,7 @@ accelerate launch train_dreambooth_lora_flux_advanced.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --dataset_name=$DATASET_NAME \
  --instance_prompt="3d icon in the style of TOK" \
+  --validation_prompt="a TOK icon of an astronaut riding a horse, in the style of TOK" \
  --output_dir=$OUTPUT_DIR \
  --caption_column="prompt" \
  --mixed_precision="bf16" \
@@ -207,7 +209,7 @@ accelerate launch train_dreambooth_lora_flux_advanced.py \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
  --rank=8 \
-  --max_train_steps=700 \
+  --max_train_steps=1000 \
  --checkpointing_steps=2000 \
  --seed="0" \
  --push_to_hub
@@ -227,6 +229,7 @@ accelerate launch train_dreambooth_lora_flux_advanced.py \
  --pretrained_model_name_or_path=$MODEL_NAME \
  --dataset_name=$DATASET_NAME \
  --instance_prompt="3d icon in the style of TOK" \
+  --validation_prompt="a TOK icon of an astronaut riding a horse, in the style of TOK" \
  --output_dir=$OUTPUT_DIR \
  --caption_column="prompt" \
  --mixed_precision="bf16" \
@@ -246,7 +249,7 @@ accelerate launch train_dreambooth_lora_flux_advanced.py \
  --lr_scheduler="constant" \
  --lr_warmup_steps=0 \
  --rank=8 \
-  --max_train_steps=700 \
+  --max_train_steps=1000 \
  --checkpointing_steps=2000 \
  --seed="0" \
  --push_to_hub
@@ -270,9 +273,8 @@ pipe = AutoPipelineForText2Image.from_pretrained("black-forest-labs/FLUX.1-dev",
 pipe.load_lora_weights(repo_id, weight_name="pytorch_lora_weights.safetensors")
 ```
 2. now we load the pivotal tuning embeddings 
-> [!NOTE] #1 if `--enable_t5_ti` wasn't passed, we only load the embeddings to the CLIP encoder.
+💡 Note that if you didn't enable `--enable_t5_ti`, the embeddings are only loaded into the CLIP encoder.

-> [!NOTE] #2 the number of tokens (i.e. <s0>,...,<si>) is either determined by `--num_new_tokens_per_abstraction` or by `--initializer_concept`. Make sure to update inference code accordingly :)
 ```python
 text_encoders = [pipe.text_encoder, pipe.text_encoder_2]
 tokenizers = [pipe.tokenizer, pipe.tokenizer_2]
@@ -74,7 +74,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -1650,8 +1650,6 @@ def main(args):
                elif isinstance(model, type(unwrap_model(text_encoder_one))):
                    if args.train_text_encoder:  # when --train_text_encoder_ti we don't save the layers
                        text_encoder_one_lora_layers_to_save = get_peft_model_state_dict(model)
-                elif isinstance(model, type(unwrap_model(text_encoder_two))):
-                    pass  # when --train_text_encoder_ti and --enable_t5_ti we don't save the layers
                else:
                    raise ValueError(f"unexpected save model: {model.__class__}")

@@ -1778,10 +1776,15 @@ def main(args):
        if not args.enable_t5_ti:
            # pure textual inversion - only clip
            if pure_textual_inversion:
-                params_to_optimize = [text_parameters_one_with_lr]
+                params_to_optimize = [
+                    text_parameters_one_with_lr,
+                ]
                te_idx = 0
            else:  # regular te training or regular pivotal for clip
-                params_to_optimize = [transformer_parameters_with_lr, text_parameters_one_with_lr]
+                params_to_optimize = [
+                    transformer_parameters_with_lr,
+                    text_parameters_one_with_lr,
+                ]
                te_idx = 1
        elif args.enable_t5_ti:
            # pivotal tuning of clip & t5
@@ -1804,7 +1807,9 @@ def main(args):
                ]
                te_idx = 1
    else:
-        params_to_optimize = [transformer_parameters_with_lr]
+        params_to_optimize = [
+            transformer_parameters_with_lr,
+        ]

    # Optimizer creation
    if not (args.optimizer.lower() == "prodigy" or args.optimizer.lower() == "adamw"):
@@ -1864,6 +1869,7 @@ def main(args):
            params_to_optimize[-1]["lr"] = args.learning_rate
        optimizer = optimizer_class(
            params_to_optimize,
+            lr=args.learning_rate,
            betas=(args.adam_beta1, args.adam_beta2),
            beta3=args.prodigy_beta3,
            weight_decay=args.adam_weight_decay,
@@ -2192,8 +2198,8 @@ def main(args):

                latent_image_ids = FluxPipeline._prepare_latent_image_ids(
                    model_input.shape[0],
-                    model_input.shape[2] // 2,
-                    model_input.shape[3] // 2,
+                    model_input.shape[2],
+                    model_input.shape[3],
                    accelerator.device,
                    weight_dtype,
                )
@@ -2247,8 +2253,8 @@ def main(args):
                )[0]
                model_pred = FluxPipeline._unpack_latents(
                    model_pred,
-                    height=model_input.shape[2] * vae_scale_factor,
-                    width=model_input.shape[3] * vae_scale_factor,
+                    height=int(model_input.shape[2] * vae_scale_factor / 2),
+                    width=int(model_input.shape[3] * vae_scale_factor / 2),
                    vae_scale_factor=vae_scale_factor,
                )

@@ -67,12 +67,11 @@ from diffusers.utils import (
    convert_state_dict_to_kohya,
    is_wandb_available,
 )
-from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
 from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -80,27 +79,30 @@ logger = get_logger(__name__)
 def save_model_card(
    repo_id: str,
    use_dora: bool,
-    images: list = None,
-    base_model: str = None,
+    images=None,
+    base_model=str,
    train_text_encoder=False,
    train_text_encoder_ti=False,
    token_abstraction_dict=None,
-    instance_prompt=None,
-    validation_prompt=None,
+    instance_prompt=str,
+    validation_prompt=str,
    repo_folder=None,
    vae_path=None,
 ):
+    img_str = "widget:\n"
    lora = "lora" if not use_dora else "dora"
-
-    widget_dict = []
-    if images is not None:
-        for i, image in enumerate(images):
-            image.save(os.path.join(repo_folder, f"image_{i}.png"))
-            widget_dict.append(
-                {"text": validation_prompt if validation_prompt else " ", "output": {"url": f"image_{i}.png"}}
-            )
-    else:
-        widget_dict.append({"text": instance_prompt})
+    for i, image in enumerate(images):
+        image.save(os.path.join(repo_folder, f"image_{i}.png"))
+        img_str += f"""
+        - text: '{validation_prompt if validation_prompt else ' ' }'
+          output:
+            url:
+                "image_{i}.png"
+        """
+    if not images:
+        img_str += f"""
+        - text: '{instance_prompt}'
+        """
    embeddings_filename = f"{repo_folder}_emb"
    instance_prompt_webui = re.sub(r"<s\d+>", "", re.sub(r"<s\d+>", embeddings_filename, instance_prompt, count=1))
    ti_keys = ", ".join(f'"{match}"' for match in re.findall(r"<s\d+>", instance_prompt))
@@ -135,7 +137,24 @@ pipeline.load_textual_inversion(state_dict["clip_l"], token=[{ti_keys}], text_en
                trigger_str += f"""
 to trigger concept `{key}` → use `{tokens}` in your prompt \n
 """
-    model_description = f"""
+
+    yaml = f"""---
+tags:
+- stable-diffusion
+- stable-diffusion-diffusers
+- diffusers-training
+- text-to-image
+- diffusers
+- {lora}
+- template:sd-lora
+{img_str}
+base_model: {base_model}
+instance_prompt: {instance_prompt}
+license: openrail++
+---
+"""
+
+    model_card = f"""
 # SD1.5 LoRA DreamBooth - {repo_id}

 <Gallery />
@@ -183,28 +202,8 @@ Pivotal tuning was enabled: {train_text_encoder_ti}.
 Special VAE used for training: {vae_path}.

 """
-    model_card = load_or_create_model_card(
-        repo_id_or_path=repo_id,
-        from_training=True,
-        license="openrail++",
-        base_model=base_model,
-        prompt=instance_prompt,
-        model_description=model_description,
-        inference=True,
-        widget=widget_dict,
-    )
-
-    tags = [
-        "text-to-image",
-        "diffusers",
-        "diffusers-training",
-        lora,
-        "template:sd-lora" "stable-diffusion",
-        "stable-diffusion-diffusers",
-    ]
-    model_card = populate_model_card(model_card, tags=tags)
-
-    model_card.save(os.path.join(repo_folder, "README.md"))
+    with open(os.path.join(repo_folder, "README.md"), "w") as f:
+        f.write(yaml + model_card)


 def import_model_class_from_model_name_or_path(
@@ -1359,7 +1358,10 @@ def main(args):
            else args.adam_weight_decay,
            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
        }
-        params_to_optimize = [unet_lora_parameters_with_lr, text_lora_parameters_one_with_lr]
+        params_to_optimize = [
+            unet_lora_parameters_with_lr,
+            text_lora_parameters_one_with_lr,
+        ]
    else:
        params_to_optimize = [unet_lora_parameters_with_lr]

@@ -1421,6 +1423,7 @@ def main(args):

        optimizer = optimizer_class(
            params_to_optimize,
+            lr=args.learning_rate,
            betas=(args.adam_beta1, args.adam_beta2),
            beta3=args.prodigy_beta3,
            weight_decay=args.adam_weight_decay,
@@ -79,7 +79,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -1794,6 +1794,7 @@ def main(args):

        optimizer = optimizer_class(
            params_to_optimize,
+            lr=args.learning_rate,
            betas=(args.adam_beta1, args.adam_beta2),
            beta3=args.prodigy_beta3,
            weight_decay=args.adam_weight_decay,
@@ -61,7 +61,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -947,6 +947,7 @@ def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):

        optimizer = optimizer_class(
            params_to_optimize,
+            lr=args.learning_rate,
            betas=(args.adam_beta1, args.adam_beta2),
            beta3=args.prodigy_beta3,
            weight_decay=args.adam_weight_decay,
@@ -52,7 +52,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -969,6 +969,7 @@ def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):

        optimizer = optimizer_class(
            params_to_optimize,
+            lr=args.learning_rate,
            betas=(args.adam_beta1, args.adam_beta2),
            beta3=args.prodigy_beta3,
            weight_decay=args.adam_weight_decay,
@@ -10,7 +10,6 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif

 | Example                                                                                                                               | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              | Code Example                                                                              | Colab                                                                                                                                                                                                              |                                                        Author |
 |:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
-|Adaptive Mask Inpainting|Adaptive Mask Inpainting algorithm from [Beyond the Contact: Discovering Comprehensive Affordance for 3D Objects from Pre-trained 2D Diffusion Models](https://github.com/snuvclab/coma) (ECCV '24, Oral) provides a way to insert human inside the scene image without altering the background, by inpainting with adapting mask.|[Adaptive Mask Inpainting](#adaptive-mask-inpainting)|-|[Hyeonwoo Kim](https://sshowbiz.xyz),[Sookwan Han](https://jellyheadandrew.github.io)|
 |Flux with CFG|[Flux with CFG](https://github.com/ToTheBeginning/PuLID/blob/main/docs/pulid_for_flux.md) provides an implementation of using CFG in [Flux](https://blackforestlabs.ai/announcing-black-forest-labs/).|[Flux with CFG](#flux-with-cfg)|NA|[Linoy Tsaban](https://github.com/linoytsaban), [Apolinário](https://github.com/apolinario), and [Sayak Paul](https://github.com/sayakpaul)|
 |Differential Diffusion|[Differential Diffusion](https://github.com/exx8/differential-diffusion) modifies an image according to a text prompt, and according to a map that specifies the amount of change in each region.|[Differential Diffusion](#differential-diffusion)|[![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/exx8/differential-diffusion) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/exx8/differential-diffusion/blob/main/examples/SD2.ipynb)|[Eran Levin](https://github.com/exx8) and [Ohad Fried](https://www.ohadf.com/)|
 | HD-Painter                                                                                                                            | [HD-Painter](https://github.com/Picsart-AI-Research/HD-Painter) enables prompt-faithfull and high resolution (up to 2k) image inpainting upon any diffusion-based image inpainting method.                                                                                                                                                                                                                                                                                                               | [HD-Painter](#hd-painter)                                                                 | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/PAIR/HD-Painter)                                                                              | [Manukyan Hayk](https://github.com/haikmanukyan) and [Sargsyan Andranik](https://github.com/AndranikSargsyan) |
@@ -74,7 +73,6 @@ Please also check out our [Community Scripts](https://github.com/huggingface/dif
 | Stable Diffusion BoxDiff Pipeline | Training-free controlled generation with bounding boxes using [BoxDiff](https://github.com/showlab/BoxDiff) | [Stable Diffusion BoxDiff Pipeline](#stable-diffusion-boxdiff) | - | [Jingyang Zhang](https://github.com/zjysteven/) |
 |   FRESCO V2V Pipeline                                                                                                    | Implementation of [[CVPR 2024] FRESCO: Spatial-Temporal Correspondence for Zero-Shot Video Translation](https://arxiv.org/abs/2403.12962)                                                                                                                                                                                                                                                                                                                                                                                                                                      | [FRESCO V2V Pipeline](#fresco)      | - |              [Yifan Zhou](https://github.com/SingleZombie) |
 | AnimateDiff IPEX Pipeline | Accelerate AnimateDiff inference pipeline with BF16/FP32 precision on Intel Xeon CPUs with [IPEX](https://github.com/intel/intel-extension-for-pytorch) | [AnimateDiff on IPEX](#animatediff-on-ipex) | - | [Dan Li](https://github.com/ustcuna/) |
-PIXART-α Controlnet pipeline | Implementation of the controlnet model for pixart alpha and its diffusers pipeline | [PIXART-α Controlnet pipeline](#pixart-α-controlnet-pipeline) | - | [Raul Ciotescu](https://github.com/raulc0399/) |
 | HunyuanDiT Differential Diffusion Pipeline | Applies [Differential Diffusion](https://github.com/exx8/differential-diffusion) to [HunyuanDiT](https://github.com/huggingface/diffusers/pull/8240). | [HunyuanDiT with Differential Diffusion](#hunyuandit-with-differential-diffusion) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1v44a5fpzyr4Ffr4v2XBQ7BajzG874N4P?usp=sharing) | [Monjoy Choudhury](https://github.com/MnCSSJ4x) |
 | [🪆Matryoshka Diffusion Models](https://huggingface.co/papers/2310.15111) | A diffusion process that denoises inputs at multiple resolutions jointly and uses a NestedUNet architecture where features and parameters for small scale inputs are nested within those of the large scales. See [original codebase](https://github.com/apple/ml-mdm). | [🪆Matryoshka Diffusion Models](#matryoshka-diffusion-models) | [![Hugging Face Space](https://img.shields.io/badge/🤗%20Hugging%20Face-Space-yellow)](https://huggingface.co/spaces/pcuenq/mdm) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/gist/tolgacangoz/1f54875fc7aeaabcf284ebde64820966/matryoshka_hf.ipynb) | [M. Tolga Cangöz](https://github.com/tolgacangoz) |

@@ -86,161 +84,6 @@ pipe = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion

 ## Example usages

-### Adaptive Mask Inpainting
-
-**Hyeonwoo Kim\*, Sookwan Han\*, Patrick Kwon, Hanbyul Joo**
-
-**Seoul National University, Naver Webtoon**
-
-Adaptive Mask Inpainting, presented in the ECCV'24 oral paper [*Beyond the Contact: Discovering Comprehensive Affordance for 3D Objects from Pre-trained 2D Diffusion Models*](https://snuvclab.github.io/coma), is an algorithm designed to insert humans into scene images without altering the background. Traditional inpainting methods often fail to preserve object geometry and details within the masked region, leading to false affordances. Adaptive Mask Inpainting addresses this issue by progressively specifying the inpainting region over diffusion timesteps, ensuring that the inserted human integrates seamlessly with the existing scene.
-
-Here is the demonstration of Adaptive Mask Inpainting:
-
-<video controls>
-  <source src="https://snuvclab.github.io/coma/static/videos/adaptive_mask_inpainting_vis.mp4" type="video/mp4">
-  Your browser does not support the video tag.
-</video>
-
-![teaser-img](https://snuvclab.github.io/coma/static/images/example_result_adaptive_mask_inpainting.png)
-
-
-You can find additional information about Adaptive Mask Inpainting in the [paper](https://arxiv.org/pdf/2401.12978) or in the [project website](https://snuvclab.github.io/coma).
-
-#### Usage example
-First, clone the diffusers GitHub repository and run the following commands to set up the environment.
-```Shell
-git clone https://github.com/huggingface/diffusers.git
-cd diffusers
-
-conda create --name ami python=3.9 -y
-conda activate ami
-
-conda install pytorch==1.10.1 torchvision==0.11.2 torchaudio==0.10.1 cudatoolkit=11.3 -c pytorch -c conda-forge -y
-python -m pip install detectron2==0.6 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu113/torch1.10/index.html
-pip install easydict
-pip install diffusers==0.20.2 accelerate safetensors transformers
-pip install setuptools==59.5.0
-pip install opencv-python
-pip install numpy==1.24.1
-```
-Then, run the code below from the 'diffusers' directory.
-```python
-import numpy as np
-import torch
-from PIL import Image
-
-from diffusers import DDIMScheduler
-from diffusers import DiffusionPipeline
-from diffusers.utils import load_image
-
-from examples.community.adaptive_mask_inpainting import download_file, AdaptiveMaskInpaintPipeline, AMI_INSTALL_MESSAGE
-
-print(AMI_INSTALL_MESSAGE)
-
-from easydict import EasyDict
-
-
-
-if __name__ == "__main__":    
-    """
-    Download Necessary Files
-    """
-    download_file(
-        url = "https://huggingface.co/datasets/jellyheadnadrew/adaptive-mask-inpainting-test-images/resolve/main/model_final_edd263.pkl?download=true",
-        output_file = "model_final_edd263.pkl",
-        exist_ok=True,
-    )
-    download_file(
-        url = "https://huggingface.co/datasets/jellyheadnadrew/adaptive-mask-inpainting-test-images/resolve/main/pointrend_rcnn_R_50_FPN_3x_coco.yaml?download=true",
-        output_file = "pointrend_rcnn_R_50_FPN_3x_coco.yaml",
-        exist_ok=True,
-    )
-    download_file(
-        url = "https://huggingface.co/datasets/jellyheadnadrew/adaptive-mask-inpainting-test-images/resolve/main/input_img.png?download=true",
-        output_file = "input_img.png",
-        exist_ok=True,
-    )
-    download_file(
-        url = "https://huggingface.co/datasets/jellyheadnadrew/adaptive-mask-inpainting-test-images/resolve/main/input_mask.png?download=true",
-        output_file = "input_mask.png",
-        exist_ok=True,
-    )
-    download_file(
-        url = "https://huggingface.co/datasets/jellyheadnadrew/adaptive-mask-inpainting-test-images/resolve/main/Base-PointRend-RCNN-FPN.yaml?download=true",
-        output_file = "Base-PointRend-RCNN-FPN.yaml",
-        exist_ok=True,
-    )
-    download_file(
-        url = "https://huggingface.co/datasets/jellyheadnadrew/adaptive-mask-inpainting-test-images/resolve/main/Base-RCNN-FPN.yaml?download=true",
-        output_file = "Base-RCNN-FPN.yaml",
-        exist_ok=True,
-    )
-    
-    """ 
-    Prepare Adaptive Mask Inpainting Pipeline
-    """
-    # device
-    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-    num_steps = 50
-    
-    # Scheduler
-    scheduler = DDIMScheduler(
-        beta_start=0.00085, 
-        beta_end=0.012, 
-        beta_schedule="scaled_linear", 
-        clip_sample=False, 
-        set_alpha_to_one=False
-    )
-    scheduler.set_timesteps(num_inference_steps=num_steps)
-
-    ## load models as pipelines
-    pipeline = AdaptiveMaskInpaintPipeline.from_pretrained(
-        "Uminosachi/realisticVisionV51_v51VAE-inpainting", 
-        scheduler=scheduler, 
-        torch_dtype=torch.float16, 
-        requires_safety_checker=False
-    ).to(device)
-
-    ## disable safety checker
-    enable_safety_checker = False
-    if not enable_safety_checker:
-        pipeline.safety_checker = None
-    
-    """ 
-    Run Adaptive Mask Inpainting 
-    """
-    default_mask_image = Image.open("./input_mask.png").convert("L")
-    init_image = Image.open("./input_img.png").convert("RGB")
-    
-    
-    seed = 59
-    generator = torch.Generator(device=device)
-    generator.manual_seed(seed)
-    
-    image = pipeline(
-        prompt="a man sitting on a couch",
-        negative_prompt="worst quality, normal quality, low quality, bad anatomy, artifacts, blurry, cropped, watermark, greyscale, nsfw",
-        image=init_image,
-        default_mask_image=default_mask_image,
-        guidance_scale=11.0,
-        strength=0.98,
-        use_adaptive_mask=True,
-        generator=generator,
-        enforce_full_mask_ratio=0.0,
-        visualization_save_dir="./ECCV2024_adaptive_mask_inpainting_demo", # DON'T CHANGE THIS!!!
-        human_detection_thres=0.015,
-    ).images[0]
-
-    
-    image.save(f'final_img.png')
-```
-#### [Troubleshooting]
-
-If you run into an error `cannot import name 'cached_download' from 'huggingface_hub'` (issue [1851](https://github.com/easydiffusion/easydiffusion/issues/1851)), remove `cached_download` from the import line in the file `diffusers/utils/dynamic_modules_utils.py`. 
-
-For example, change the import line in `.../env/lib/python3.8/site-packages/diffusers/utils/dynamic_modules_utils.py`.
-
-
 ### Flux with CFG

 Know more about Flux [here](https://blackforestlabs.ai/announcing-black-forest-labs/). Since Flux doesn't use CFG, this implementation provides one, inspired by the [PuLID Flux adaptation](https://github.com/ToTheBeginning/PuLID/blob/main/docs/pulid_for_flux.md).
@@ -4493,19 +4336,19 @@ The Abstract of the paper:

 **64x64**
 :-------------------------:
-| <img src="https://github.com/user-attachments/assets/032738eb-c6cd-4fd9-b4d7-a7317b4b6528" width="222" height="222" alt="bird_64_64"> |
+| <img src="https://github.com/user-attachments/assets/9e7bb2cd-45a0-4bd1-adb8-23e283baed39" width="222" height="222" alt="bird_64"> |

 - `256×256, nesting_level=1`: 1.776 GiB. With `150` DDIM inference steps:

 **64x64**             |  **256x256**
 :-------------------------:|:-------------------------:
-| <img src="https://github.com/user-attachments/assets/21b9ad8b-eea6-4603-80a2-31180f391589" width="222" height="222" alt="bird_256_64"> | <img src="https://github.com/user-attachments/assets/fc411682-8a36-422c-9488-395b77d4406e" width="222" height="222" alt="bird_256_256"> |
+| <img src="https://github.com/user-attachments/assets/6b724c2e-5e6a-4b63-9b65-c1182cbb67e0" width="222" height="222" alt="64x64"> | <img src="https://github.com/user-attachments/assets/7dbab2ad-bf40-4a73-ab04-f178347cb7d5" width="222" height="222" alt="256x256"> |

- `1024×1024, nesting_level=2`: 1.792 GiB. As one can realize the cost of adding another layer is really negligible in this context! With `250` DDIM inference steps:
+- `1024×1024, nesting_level=2`: 1.792 GiB. As one can see, the cost of adding another layer is negligible. With `250` DDIM inference steps:

 **64x64**             |  **256x256**  |  **1024x1024**
 :-------------------------:|:-------------------------:|:-------------------------:
-| <img src="https://github.com/user-attachments/assets/febf4b98-3dee-4a8e-9946-fd42e1f232e6" width="222" height="222" alt="bird_1024_64"> | <img src="https://github.com/user-attachments/assets/c5f85b40-5d6d-4267-a92a-c89dff015b9b" width="222" height="222" alt="bird_1024_256"> | <img src="https://github.com/user-attachments/assets/ad66b913-4367-4cb9-889e-bc06f4d96148" width="222" height="222" alt="bird_1024_1024"> |
+| <img src="https://github.com/user-attachments/assets/4a9454e4-e20a-4736-a196-270e2ae796c0" width="222" height="222" alt="64x64"> | <img src="https://github.com/user-attachments/assets/4a96555d-0fda-4303-82b1-a4d886f770b9" width="222" height="222" alt="256x256"> | <img src="https://github.com/user-attachments/assets/e0239b7a-ab73-4d45-8f3e-b4e6b4b50abe" width="222" height="222" alt="1024x1024"> |

 ```py
 from diffusers import DiffusionPipeline
@@ -4519,7 +4362,8 @@ pipe = DiffusionPipeline.from_pretrained("tolgacangoz/matryoshka-diffusion-model

 prompt0 = "a blue jay stops on the top of a helmet of Japanese samurai, background with sakura tree"
 prompt = f"breathtaking {prompt0}. award-winning, professional, highly detailed"
-image = pipe(prompt, num_inference_steps=50).images
+negative_prompt = "deformed, mutated, ugly, disfigured, blur, blurry, noise, noisy"
+image = pipe(prompt=prompt, negative_prompt=negative_prompt, num_inference_steps=50).images
 make_image_grid(image, rows=1, cols=len(image))

 # pipe.change_nesting_level(<int>)  # 0, 1, or 2
@@ -4602,94 +4446,3 @@ grid_image.save(grid_dir + "sample.png")
 `pag_scale` : guidance scale of PAG (ex: 5.0)

 `pag_applied_layers_index` : index of the layer to apply perturbation (ex: ['m0'])
-
-# PIXART-α Controlnet pipeline
-
-[Project](https://pixart-alpha.github.io/) / [GitHub](https://github.com/PixArt-alpha/PixArt-alpha/blob/master/asset/docs/pixart_controlnet.md)
-
-This is the implementation of the controlnet model and the pipeline for the Pixart-alpha model, adapted to use the HuggingFace Diffusers.
-
-## Example Usage
-
-This example uses the Pixart HED Controlnet model, converted from the control net model as trained by the authors of the paper.
-
-```py
-import sys
-import os
-import torch
-import torchvision.transforms as T
-import torchvision.transforms.functional as TF
-
-from pipeline_pixart_alpha_controlnet import PixArtAlphaControlnetPipeline
-from diffusers.utils import load_image
-
-from diffusers.image_processor import PixArtImageProcessor
-
-from controlnet_aux import HEDdetector
-
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from pixart.controlnet_pixart_alpha import PixArtControlNetAdapterModel
-
-controlnet_repo_id = "raulc0399/pixart-alpha-hed-controlnet"
-
-weight_dtype = torch.float16
-image_size = 1024
-
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-torch.manual_seed(0)
-
-# load controlnet
-controlnet = PixArtControlNetAdapterModel.from_pretrained(
-    controlnet_repo_id,
-    torch_dtype=weight_dtype,
-    use_safetensors=True,
-).to(device)
-
-pipe = PixArtAlphaControlnetPipeline.from_pretrained(
-    "PixArt-alpha/PixArt-XL-2-1024-MS",
-    controlnet=controlnet,
-    torch_dtype=weight_dtype,
-    use_safetensors=True,
-).to(device)
-
-images_path = "images"
-control_image_file = "0_7.jpg"
-
-prompt = "battleship in space, galaxy in background"
-
-control_image_name = control_image_file.split('.')[0]
-
-control_image = load_image(f"{images_path}/{control_image_file}")
-print(control_image.size)
-height, width = control_image.size
-
-hed = HEDdetector.from_pretrained("lllyasviel/Annotators")
-
-condition_transform = T.Compose([
-    T.Lambda(lambda img: img.convert('RGB')),
-    T.CenterCrop([image_size, image_size]),
-])
-
-control_image = condition_transform(control_image)
-hed_edge = hed(control_image, detect_resolution=image_size, image_resolution=image_size)
-
-hed_edge.save(f"{images_path}/{control_image_name}_hed.jpg")
-
-# run pipeline
-with torch.no_grad():
-    out = pipe(
-        prompt=prompt,
-        image=hed_edge,
-        num_inference_steps=14,
-        guidance_scale=4.5,
-        height=image_size,
-        width=image_size,
-    )
-
-    out.images[0].save(f"{images_path}//{control_image_name}_output.jpg")
-    
-```
-
-In the folder examples/pixart there is also a script that can be used to train new models.
-Please check the script `train_controlnet_hf_diffusers.sh` on how to start the training.
@@ -8,7 +8,6 @@ If a community script doesn't work as expected, please open an issue and ping th
 |:--------------------------------------------------------------------------------------------------------------------------------------|:---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|:------------------------------------------------------------------------------------------|:-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------:|
 | Using IP-Adapter with negative noise                                                                                                  | Using negative noise with IP-adapter to better control the generation (see the [original post](https://github.com/huggingface/diffusers/discussions/7167) on the forum for more details)                                                                                                                                                                                                                                                    | [IP-Adapter Negative Noise](#ip-adapter-negative-noise)                                   | | [Álvaro Somoza](https://github.com/asomoza)|
 | asymmetric tiling                                                                                                  |configure seamless image tiling independently for the X and Y axes                                                                                                                                                                                                      | [Asymmetric Tiling](#asymmetric-tiling )                                   | | [alexisrolland](https://github.com/alexisrolland)|
-| Prompt scheduling callback                                                                                                  |Allows changing prompts during a generation                                                                                                                                                                                                      | [Prompt Scheduling](#prompt-scheduling )                                   | | [hlky](https://github.com/hlky)|


 ## Example usages
@@ -230,86 +229,4 @@ seamless_tiling(pipeline=pipeline, x_axis=False, y_axis=False)

 torch.cuda.empty_cache()
 image.save('image.png')
-```
-
-### Prompt Scheduling callback
-
-Prompt scheduling callback allows changing prompts during a generation, like [prompt editing in A1111](https://github.com/AUTOMATIC1111/stable-diffusion-webui/wiki/Features#prompt-editing)
-
-```python
-from diffusers import StableDiffusionPipeline
-from diffusers.callbacks import PipelineCallback, MultiPipelineCallbacks
-from diffusers.configuration_utils import register_to_config
-import torch
-from typing import Any, Dict, Optional
-
-
-pipeline: StableDiffusionPipeline = StableDiffusionPipeline.from_pretrained(
-    "stable-diffusion-v1-5/stable-diffusion-v1-5",
-    torch_dtype=torch.float16,
-    variant="fp16",
-    use_safetensors=True,
-).to("cuda")
-pipeline.safety_checker = None
-pipeline.requires_safety_checker = False
-
-
-class SDPromptScheduleCallback(PipelineCallback):
-    @register_to_config
-    def __init__(
-        self,
-        prompt: str,
-        negative_prompt: Optional[str] = None,
-        num_images_per_prompt: int = 1,
-        cutoff_step_ratio=1.0,
-        cutoff_step_index=None,
-    ):
-        super().__init__(
-            cutoff_step_ratio=cutoff_step_ratio, cutoff_step_index=cutoff_step_index
-        )
-
-    tensor_inputs = ["prompt_embeds"]
-
-    def callback_fn(
-        self, pipeline, step_index, timestep, callback_kwargs
-    ) -> Dict[str, Any]:
-        cutoff_step_ratio = self.config.cutoff_step_ratio
-        cutoff_step_index = self.config.cutoff_step_index
-
-        # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
-        cutoff_step = (
-            cutoff_step_index
-            if cutoff_step_index is not None
-            else int(pipeline.num_timesteps * cutoff_step_ratio)
-        )
-
-        if step_index == cutoff_step:
-            prompt_embeds, negative_prompt_embeds = pipeline.encode_prompt(
-                prompt=self.config.prompt,
-                negative_prompt=self.config.negative_prompt,
-                device=pipeline._execution_device,
-                num_images_per_prompt=self.config.num_images_per_prompt,
-                do_classifier_free_guidance=pipeline.do_classifier_free_guidance,
-            )
-            if pipeline.do_classifier_free_guidance:
-                prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])
-            callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
-        return callback_kwargs
-
-callback = MultiPipelineCallbacks(
-    [
-        SDPromptScheduleCallback(
-            prompt="Official portrait of a smiling world war ii general, female, cheerful, happy, detailed face, 20th century, highly detailed, cinematic lighting, digital art painting by Greg Rutkowski",
-            negative_prompt="Deformed, ugly, bad anatomy",
-            cutoff_step_ratio=0.25,
-        )
-    ]
-)
-
-image = pipeline(
-    prompt="Official portrait of a smiling world war ii general, male, cheerful, happy, detailed face, 20th century, highly detailed, cinematic lighting, digital art painting by Greg Rutkowski",
-    negative_prompt="Deformed, ugly, bad anatomy",
-    callback_on_step_end=callback,
-    callback_on_step_end_tensor_inputs=["prompt_embeds"],
-).images[0]
-```
+```
@@ -43,7 +43,7 @@ from diffusers.utils import BaseOutput, check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")


 class MarigoldDepthOutput(BaseOutput):
@@ -107,16 +107,15 @@ EXAMPLE_DOC_STRING = """

        >>> # nesting_level=0 -> 64x64; nesting_level=1 -> 256x256 - 64x64; nesting_level=2 -> 1024x1024 - 256x256 - 64x64
        >>> pipe = DiffusionPipeline.from_pretrained("tolgacangoz/matryoshka-diffusion-models",
-        ...                                         nesting_level=0,
-        ...                                         trust_remote_code=False,  # One needs to give permission for this code to run
-        ...                                         ).to("cuda")
+        >>>                                          custom_pipeline="matryoshka").to("cuda")

        >>> prompt0 = "a blue jay stops on the top of a helmet of Japanese samurai, background with sakura tree"
        >>> prompt = f"breathtaking {prompt0}. award-winning, professional, highly detailed"
-        >>> image = pipe(prompt, num_inference_steps=50).images
+        >>> negative_prompt = "deformed, mutated, ugly, disfigured, blur, blurry, noise, noisy"
+        >>> image = pipe(prompt=prompt, negative_prompt=negative_prompt, num_inference_steps=50).images
        >>> make_image_grid(image, rows=1, cols=len(image))

-        >>> # pipe.change_nesting_level(<int>)  # 0, 1, or 2
+        >>> pipe.change_nesting_level(<int>)  # 0, 1, or 2
        >>> # 50+, 100+, and 250+ num_inference_steps are recommended for nesting levels 0, 1, and 2 respectively.
        ```
 """
@@ -421,7 +420,6 @@ class MatryoshkaDDIMScheduler(SchedulerMixin, ConfigMixin):
        self.timesteps = torch.from_numpy(np.arange(0, num_train_timesteps)[::-1].copy().astype(np.int64))

        self.scales = None
-        self.schedule_shifted_power = 1.0

    def scale_model_input(self, sample: torch.Tensor, timestep: Optional[int] = None) -> torch.Tensor:
        """
@@ -534,7 +532,6 @@ class MatryoshkaDDIMScheduler(SchedulerMixin, ConfigMixin):

    def get_schedule_shifted(self, alpha_prod, scale_factor=None):
        if (scale_factor is not None) and (scale_factor > 1):  # rescale noise schedule
-            scale_factor = scale_factor**self.schedule_shifted_power
            snr = alpha_prod / (1 - alpha_prod)
            scaled_snr = snr / scale_factor
            alpha_prod = 1 / (1 + 1 / scaled_snr)
@@ -642,14 +639,17 @@ class MatryoshkaDDIMScheduler(SchedulerMixin, ConfigMixin):
        # 4. Clip or threshold "predicted x_0"
        if self.config.thresholding:
            if len(model_output) > 1:
-                pred_original_sample = [self._threshold_sample(p_o_s) for p_o_s in pred_original_sample]
+                pred_original_sample = [
+                    self._threshold_sample(p_o_s * scale) / scale
+                    for p_o_s, scale in zip(pred_original_sample, self.scales)
+                ]
            else:
                pred_original_sample = self._threshold_sample(pred_original_sample)
        elif self.config.clip_sample:
            if len(model_output) > 1:
                pred_original_sample = [
-                    p_o_s.clamp(-self.config.clip_sample_range, self.config.clip_sample_range)
-                    for p_o_s in pred_original_sample
+                    (p_o_s * scale).clamp(-self.config.clip_sample_range, self.config.clip_sample_range) / scale
+                    for p_o_s, scale in zip(pred_original_sample, self.scales)
                ]
            else:
                pred_original_sample = pred_original_sample.clamp(
@@ -3816,8 +3816,6 @@ class MatryoshkaPipeline(

        if hasattr(unet, "nest_ratio"):
            scheduler.scales = unet.nest_ratio + [1]
-            if nesting_level == 2:
-                scheduler.schedule_shifted_power = 2.0

        self.register_modules(
            text_encoder=text_encoder,
@@ -3844,14 +3842,12 @@ class MatryoshkaPipeline(
            ).to(self.device)
            self.config.nesting_level = 1
            self.scheduler.scales = self.unet.nest_ratio + [1]
-            self.scheduler.schedule_shifted_power = 1.0
        elif nesting_level == 2:
            self.unet = NestedUNet2DConditionModel.from_pretrained(
                "tolgacangoz/matryoshka-diffusion-models", subfolder="unet/nesting_level_2"
            ).to(self.device)
            self.config.nesting_level = 2
            self.scheduler.scales = self.unet.nest_ratio + [1]
-            self.scheduler.schedule_shifted_power = 2.0
        else:
            raise ValueError("Currently, nesting levels 0, 1, and 2 are supported.")

@@ -4631,8 +4627,8 @@ class MatryoshkaPipeline(
        image = latents

        if self.scheduler.scales is not None:
-            for i, img in enumerate(image):
-                image[i] = self.image_processor.postprocess(img, output_type=output_type)[0]
+            for i, (img, scale) in enumerate(zip(image, self.scheduler.scales)):
+                image[i] = self.image_processor.postprocess(img * scale, output_type=output_type)[0]
        else:
            image = self.image_processor.postprocess(image, output_type=output_type)

@@ -73,7 +73,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -66,7 +66,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -79,7 +79,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -72,7 +72,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -78,7 +78,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -104,7 +104,7 @@ from diffusers.utils import load_image
 import torch

 base_model_path = "stabilityai/stable-diffusion-3-medium-diffusers"
-controlnet_path = "DavyMorgan/sd3-controlnet-out"
+controlnet_path = "sd3-controlnet-out/checkpoint-6500/controlnet"

 controlnet = SD3ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.float16)
 pipe = StableDiffusion3ControlNetPipeline.from_pretrained(
@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -1048,9 +1048,7 @@ def main(args):

                # Add noise to the latents according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
-                noisy_latents = noise_scheduler.add_noise(latents.float(), noise.float(), timesteps).to(
-                    dtype=weight_dtype
-                )
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                # Get the text embedding for conditioning
                encoder_hidden_states = text_encoder(batch["input_ids"], return_dict=False)[0]
@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = logging.getLogger(__name__)

@@ -65,7 +65,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)
 if is_torch_npu_available():
@@ -152,7 +152,6 @@ def log_validation(
                    guidance_scale=3.5,
                    generator=generator,
                ).images[0]
-            image = image.resize((args.resolution, args.resolution))
            images.append(image)
        image_logs.append(
            {"validation_image": validation_image, "images": images, "validation_prompt": validation_prompt}
@@ -1257,8 +1256,8 @@ def main(args):

                latent_image_ids = FluxControlNetPipeline._prepare_latent_image_ids(
                    batch_size=pixel_latents_tmp.shape[0],
-                    height=pixel_latents_tmp.shape[2] // 2,
-                    width=pixel_latents_tmp.shape[3] // 2,
+                    height=pixel_latents_tmp.shape[2],
+                    width=pixel_latents_tmp.shape[3],
                    device=pixel_values.device,
                    dtype=pixel_values.dtype,
                )
@@ -50,7 +50,7 @@ from diffusers import (
 )
 from diffusers.optimization import get_scheduler
 from diffusers.training_utils import compute_density_for_timestep_sampling, compute_loss_weighting_for_sd3, free_memory
-from diffusers.utils import check_min_version, is_wandb_available, make_image_grid
+from diffusers.utils import check_min_version, is_wandb_available
 from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
 from diffusers.utils.torch_utils import is_compiled_module

@@ -59,11 +59,22 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.30.0.dev0")

 logger = get_logger(__name__)


+def image_grid(imgs, rows, cols):
+    assert len(imgs) == rows * cols
+
+    w, h = imgs[0].size
+    grid = Image.new("RGB", size=(cols * w, rows * h))
+
+    for i, img in enumerate(imgs):
+        grid.paste(img, box=(i % cols * w, i // cols * h))
+    return grid
+
+
 def log_validation(controlnet, args, accelerator, weight_dtype, step, is_final_validation=False):
    logger.info("Running validation... ")

@@ -213,7 +224,7 @@ def save_model_card(repo_id: str, image_logs=None, base_model=str, repo_folder=N
            validation_image.save(os.path.join(repo_folder, "image_control.png"))
            img_str += f"prompt: {validation_prompt}\n"
            images = [validation_image] + images
-            make_image_grid(images, 1, len(images)).save(os.path.join(repo_folder, f"images_{i}.png"))
+            image_grid(images, 1, len(images)).save(os.path.join(repo_folder, f"images_{i}.png"))
            img_str += f"![images_{i})](./images_{i}.png)\n"

    model_description = f"""
@@ -61,7 +61,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)
 if is_torch_npu_available():
@@ -1210,9 +1210,7 @@ def main(args):

                # Add noise to the latents according to the noise magnitude at each timestep
                # (this is the forward diffusion process)
-                noisy_latents = noise_scheduler.add_noise(latents.float(), noise.float(), timesteps).to(
-                    dtype=weight_dtype
-                )
+                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                # ControlNet conditioning.
                controlnet_image = batch["conditioning_pixel_values"].to(dtype=weight_dtype)
@@ -63,7 +63,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -170,21 +170,6 @@ accelerate launch train_dreambooth_lora_flux.py \
  --push_to_hub
 ```

-### Target Modules
-When LoRA was first adapted from language models to diffusion models, it was applied to the cross-attention layers in the Unet that relate the image representations with the prompts that describe them. 
-More recently, SOTA text-to-image diffusion models replaced the Unet with a diffusion Transformer(DiT). With this change, we may also want to explore 
-applying LoRA training onto different types of layers and blocks. To allow more flexibility and control over the targeted modules we added `--lora_layers`- in which you can specify in a comma seperated string
-the exact modules for LoRA training. Here are some examples of target modules you can provide: 
- for attention only layers: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0"`
- to train the same modules as in the fal trainer: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2"`
- to train the same modules as in ostris ai-toolkit / replicate trainer: `--lora_blocks="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2,norm1_context.linear, norm1.linear,norm.linear,proj_mlp,proj_out"`
-> [!NOTE]
-> `--lora_layers` can also be used to specify which **blocks** to apply LoRA training to. To do so, simply add a block prefix to each layer in the comma seperated string:
-> **single DiT blocks**: to target the ith single transformer block, add the prefix `single_transformer_blocks.i`, e.g. - `single_transformer_blocks.i.attn.to_k`
-> **MMDiT blocks**: to target the ith MMDiT block, add the prefix `transformer_blocks.i`, e.g. - `transformer_blocks.i.attn.to_k` 
-> [!NOTE]
-> keep in mind that while training more layers can improve quality and expressiveness, it also increases the size of the output LoRA weights.
-
 ### Text Encoder Training

 Alongside the transformer, fine-tuning of the CLIP text encoder is also supported.
@@ -147,40 +147,6 @@ accelerate launch train_dreambooth_lora_sd3.py \
  --push_to_hub
 ```

-### Targeting Specific Blocks & Layers
-As image generation models get bigger & more powerful, more fine-tuners come to find that training only part of the 
-transformer blocks (sometimes as little as two) can be enough to get great results. 
-In some cases, it can be even better to maintain some of the blocks/layers frozen.
-
-For **SD3.5-Large** specifically, you may find this information useful (taken from: [Stable Diffusion 3.5 Large Fine-tuning Tutorial](https://stabilityai.notion.site/Stable-Diffusion-3-5-Large-Fine-tuning-Tutorial-11a61cdcd1968027a15bdbd7c40be8c6#12461cdcd19680788a23c650dab26b93):
-> [!NOTE]
-> A commonly believed heuristic that we verified once again during the construction of the SD3.5 family of models is that later/higher layers (i.e. `30 - 37`)* impact tertiary details more heavily. Conversely, earlier layers (i.e. `12 - 24` )* influence the overall composition/primary form more. 
-> So, freezing other layers/targeting specific layers is a viable approach.
-> `*`These suggested layers are speculative and not 100% guaranteed. The tips here are more or less a general idea for next steps.
-> **Photorealism**
-> In preliminary testing, we observed that freezing the last few layers of the architecture significantly improved model training when using a photorealistic dataset, preventing detail degradation introduced by small dataset from happening.
-> **Anatomy preservation**
-> To dampen any possible degradation of anatomy, training only the attention layers and **not** the adaptive linear layers could help. For reference, below is one of the transformer blocks.
-
-
-We've added `--lora_layers` and `--lora_blocks` to make LoRA training modules configurable. 
- with `--lora_blocks` you can specify the block numbers for training. E.g. passing - 
-```diff
--lora_blocks "12,13,14,15,16,17,18,19,20,21,22,23,24,30,31,32,33,34,35,36,37"
-```
-will trigger LoRA training of transformer blocks 12-24 and 30-37. By default, all blocks are trained. 
- with `--lora_layers` you can specify the types of layers you wish to train. 
-By default, the trained layers are -  
-`attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,attn.to_k,attn.to_out.0,attn.to_q,attn.to_v`
-If you wish to have a leaner LoRA / train more blocks over layers you could pass - 
-```diff
-+ --lora_layers attn.to_k,attn.to_q,attn.to_v,attn.to_out.0
-```
-This will reduce LoRA size by roughly 50% for the same rank compared to the default. 
-However, if you're after compact LoRAs, it's our impression that maintaining the default setting for `--lora_layers` and
-freezing some of the early & blocks is usually better. 
-
-
 ### Text Encoder Training
 Alongside the transformer, LoRA fine-tuning of the CLIP text encoders is now also supported.
 To do so, just specify `--train_text_encoder` while launching training. Please keep the following points in mind:
@@ -37,7 +37,6 @@ class DreamBoothLoRAFlux(ExamplesTestsAccelerate):
    instance_prompt = "photo"
    pretrained_model_name_or_path = "hf-internal-testing/tiny-flux-pipe"
    script_path = "examples/dreambooth/train_dreambooth_lora_flux.py"
-    transformer_layer_type = "single_transformer_blocks.0.attn.to_k"

    def test_dreambooth_lora_flux(self):
        with tempfile.TemporaryDirectory() as tmpdir:
@@ -137,43 +136,6 @@ class DreamBoothLoRAFlux(ExamplesTestsAccelerate):
            starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
            self.assertTrue(starts_with_transformer)

-    def test_dreambooth_lora_layers(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                {self.script_path}
-                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
-                --instance_data_dir {self.instance_data_dir}
-                --instance_prompt {self.instance_prompt}
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --cache_latents
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lora_layers {self.transformer_layer_type}
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # when not training the text encoder, all the parameters in the state dict should start
-            # with `"transformer"` in their names. In this test, we only params of
-            # transformer.single_transformer_blocks.0.attn.to_k should be in the state dict
-            starts_with_transformer = all(
-                key.startswith("transformer.single_transformer_blocks.0.attn.to_k") for key in lora_state_dict.keys()
-            )
-            self.assertTrue(starts_with_transformer)
-
    def test_dreambooth_lora_flux_checkpointing_checkpoints_total_limit(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            test_args = f"""
@@ -38,9 +38,6 @@ class DreamBoothLoRASD3(ExamplesTestsAccelerate):
    pretrained_model_name_or_path = "hf-internal-testing/tiny-sd3-pipe"
    script_path = "examples/dreambooth/train_dreambooth_lora_sd3.py"

-    transformer_block_idx = 0
-    layer_type = "attn.to_k"
-
    def test_dreambooth_lora_sd3(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            test_args = f"""
@@ -139,74 +136,6 @@ class DreamBoothLoRASD3(ExamplesTestsAccelerate):
            starts_with_transformer = all(key.startswith("transformer") for key in lora_state_dict.keys())
            self.assertTrue(starts_with_transformer)

-    def test_dreambooth_lora_block(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                {self.script_path}
-                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
-                --instance_data_dir {self.instance_data_dir}
-                --instance_prompt {self.instance_prompt}
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --lora_blocks {self.transformer_block_idx}
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # when not training the text encoder, all the parameters in the state dict should start
-            # with `"transformer"` in their names.
-            # In this test, only params of transformer block 0 should be in the state dict
-            starts_with_transformer = all(
-                key.startswith("transformer.transformer_blocks.0") for key in lora_state_dict.keys()
-            )
-            self.assertTrue(starts_with_transformer)
-
-    def test_dreambooth_lora_layer(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
-            test_args = f"""
-                {self.script_path}
-                --pretrained_model_name_or_path {self.pretrained_model_name_or_path}
-                --instance_data_dir {self.instance_data_dir}
-                --instance_prompt {self.instance_prompt}
-                --resolution 64
-                --train_batch_size 1
-                --gradient_accumulation_steps 1
-                --max_train_steps 2
-                --lora_layers {self.layer_type}
-                --learning_rate 5.0e-04
-                --scale_lr
-                --lr_scheduler constant
-                --lr_warmup_steps 0
-                --output_dir {tmpdir}
-                """.split()
-
-            run_command(self._launch_args + test_args)
-            # save_pretrained smoke test
-            self.assertTrue(os.path.isfile(os.path.join(tmpdir, "pytorch_lora_weights.safetensors")))
-
-            # make sure the state_dict has the correct naming in the parameters.
-            lora_state_dict = safetensors.torch.load_file(os.path.join(tmpdir, "pytorch_lora_weights.safetensors"))
-            is_lora = all("lora" in k for k in lora_state_dict.keys())
-            self.assertTrue(is_lora)
-
-            # In this test, only transformer params of attention layers `attn.to_k` should be in the state dict
-            starts_with_transformer = all("attn.to_k" in key for key in lora_state_dict.keys())
-            self.assertTrue(starts_with_transformer)
-
    def test_dreambooth_lora_sd3_checkpointing_checkpoints_total_limit(self):
        with tempfile.TemporaryDirectory() as tmpdir:
            test_args = f"""
@@ -63,7 +63,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -35,7 +35,7 @@ from diffusers.utils import check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 # Cache compiled models across invocations of this script.
 cc.initialize_cache(os.path.expanduser("~/.cache/jax/compilation_cache"))
@@ -57,7 +57,6 @@ from diffusers.utils import (
    is_wandb_available,
 )
 from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
-from diffusers.utils.import_utils import is_torch_npu_available
 from diffusers.utils.torch_utils import is_compiled_module


@@ -65,16 +64,10 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

-if is_torch_npu_available():
-    import torch_npu
-
-    torch.npu.config.allow_internal_format = False
-    torch.npu.set_compile_mode(jit_compile=False)
-

 def save_model_card(
    repo_id: str,
@@ -168,7 +161,7 @@ def log_validation(
        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
        f" {args.validation_prompt}."
    )
-    pipeline = pipeline.to(accelerator.device)
+    pipeline = pipeline.to(accelerator.device, dtype=torch_dtype)
    pipeline.set_progress_bar_config(disable=True)

    # run inference
@@ -196,8 +189,6 @@ def log_validation(
    del pipeline
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
-    elif is_torch_npu_available():
-        torch_npu.npu.empty_cache()

    return images

@@ -1044,9 +1035,7 @@ def main(args):
        cur_class_images = len(list(class_images_dir.iterdir()))

        if cur_class_images < args.num_class_images:
-            has_supported_fp16_accelerator = (
-                torch.cuda.is_available() or torch.backends.mps.is_available() or is_torch_npu_available()
-            )
+            has_supported_fp16_accelerator = torch.cuda.is_available() or torch.backends.mps.is_available()
            torch_dtype = torch.float16 if has_supported_fp16_accelerator else torch.float32
            if args.prior_generation_precision == "fp32":
                torch_dtype = torch.float32
@@ -1084,8 +1073,6 @@ def main(args):
            del pipeline
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
-            elif is_torch_npu_available():
-                torch_npu.npu.empty_cache()

    # Handle the repository creation
    if accelerator.is_main_process:
@@ -1239,7 +1226,10 @@ def main(args):
            "weight_decay": args.adam_weight_decay_text_encoder,
            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
        }
-        params_to_optimize = [transformer_parameters_with_lr, text_parameters_one_with_lr]
+        params_to_optimize = [
+            transformer_parameters_with_lr,
+            text_parameters_one_with_lr,
+        ]
    else:
        params_to_optimize = [transformer_parameters_with_lr]

@@ -1298,9 +1288,11 @@ def main(args):
            # changes the learning rate of text_encoder_parameters_one and text_encoder_parameters_two to be
            # --learning_rate
            params_to_optimize[1]["lr"] = args.learning_rate
+            params_to_optimize[2]["lr"] = args.learning_rate

        optimizer = optimizer_class(
            params_to_optimize,
+            lr=args.learning_rate,
            betas=(args.adam_beta1, args.adam_beta2),
            beta3=args.prodigy_beta3,
            weight_decay=args.adam_weight_decay,
@@ -1367,8 +1359,6 @@ def main(args):
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
-        elif is_torch_npu_available():
-            torch_npu.npu.empty_cache()

    # If custom instance prompts are NOT provided (i.e. the instance prompt is used for all images),
    # pack the statically computed variables appropriately here. This is so that we don't
@@ -1550,12 +1540,12 @@ def main(args):
                model_input = (model_input - vae.config.shift_factor) * vae.config.scaling_factor
                model_input = model_input.to(dtype=weight_dtype)

-                vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
+                vae_scale_factor = 2 ** (len(vae.config.block_out_channels))

                latent_image_ids = FluxPipeline._prepare_latent_image_ids(
                    model_input.shape[0],
-                    model_input.shape[2] // 2,
-                    model_input.shape[3] // 2,
+                    model_input.shape[2],
+                    model_input.shape[3],
                    accelerator.device,
                    weight_dtype,
                )
@@ -1590,7 +1580,7 @@ def main(args):
                )

                # handle guidance
-                if accelerator.unwrap_model(transformer).config.guidance_embeds:
+                if transformer.config.guidance_embeds:
                    guidance = torch.tensor([args.guidance_scale], device=accelerator.device)
                    guidance = guidance.expand(model_input.shape[0])
                else:
@@ -1611,8 +1601,8 @@ def main(args):
                # upscaling height & width as discussed in https://github.com/huggingface/diffusers/pull/9257#discussion_r1731108042
                model_pred = FluxPipeline._unpack_latents(
                    model_pred,
-                    height=model_input.shape[2] * vae_scale_factor,
-                    width=model_input.shape[3] * vae_scale_factor,
+                    height=int(model_input.shape[2] * vae_scale_factor / 2),
+                    width=int(model_input.shape[3] * vae_scale_factor / 2),
                    vae_scale_factor=vae_scale_factor,
                )

@@ -1704,8 +1694,6 @@ def main(args):
                # create pipeline
                if not args.train_text_encoder:
                    text_encoder_one, text_encoder_two = load_text_encoders(text_encoder_cls_one, text_encoder_cls_two)
-                    text_encoder_one.to(weight_dtype)
-                    text_encoder_two.to(weight_dtype)
                else:  # even when training the text encoder we're only training text encoder one
                    text_encoder_two = text_encoder_cls_two.from_pretrained(
                        args.pretrained_model_name_or_path,
@@ -1734,15 +1722,9 @@ def main(args):
                )
                if not args.train_text_encoder:
                    del text_encoder_one, text_encoder_two
-                    if torch.cuda.is_available():
-                        torch.cuda.empty_cache()
-                    elif is_torch_npu_available():
-                        torch_npu.npu.empty_cache()
+                    torch.cuda.empty_cache()
                    gc.collect()

-                images = None
-                del pipeline
-
    # Save the lora layers
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
@@ -1801,9 +1783,6 @@ def main(args):
                ignore_patterns=["step_*", "epoch_*"],
            )

-        images = None
-        del pipeline
-
    accelerator.end_training()


@@ -70,7 +70,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -72,7 +72,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -177,7 +177,7 @@ def log_validation(
        f"Running validation... \n Generating {args.num_validation_images} images with prompt:"
        f" {args.validation_prompt}."
    )
-    pipeline = pipeline.to(accelerator.device)
+    pipeline = pipeline.to(accelerator.device, dtype=torch_dtype)
    pipeline.set_progress_bar_config(disable=True)

    # run inference
@@ -554,15 +554,6 @@ def parse_args(input_args=None):
        "--adam_weight_decay_text_encoder", type=float, default=1e-03, help="Weight decay to use for text_encoder"
    )

-    parser.add_argument(
-        "--lora_layers",
-        type=str,
-        default=None,
-        help=(
-            'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
-        ),
-    )
-
    parser.add_argument(
        "--adam_epsilon",
        type=float,
@@ -1195,30 +1186,12 @@ def main(args):
        if args.train_text_encoder:
            text_encoder_one.gradient_checkpointing_enable()

-    if args.lora_layers is not None:
-        target_modules = [layer.strip() for layer in args.lora_layers.split(",")]
-    else:
-        target_modules = [
-            "attn.to_k",
-            "attn.to_q",
-            "attn.to_v",
-            "attn.to_out.0",
-            "attn.add_k_proj",
-            "attn.add_q_proj",
-            "attn.add_v_proj",
-            "attn.to_add_out",
-            "ff.net.0.proj",
-            "ff.net.2",
-            "ff_context.net.0.proj",
-            "ff_context.net.2",
-        ]
-
-    # now we will add new LoRA weights the transformer layers
+    # now we will add new LoRA weights to the attention layers
    transformer_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
        init_lora_weights="gaussian",
-        target_modules=target_modules,
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
    )
    transformer.add_adapter(transformer_lora_config)
    if args.train_text_encoder:
@@ -1335,7 +1308,10 @@ def main(args):
            "weight_decay": args.adam_weight_decay_text_encoder,
            "lr": args.text_encoder_lr if args.text_encoder_lr else args.learning_rate,
        }
-        params_to_optimize = [transformer_parameters_with_lr, text_parameters_one_with_lr]
+        params_to_optimize = [
+            transformer_parameters_with_lr,
+            text_parameters_one_with_lr,
+        ]
    else:
        params_to_optimize = [transformer_parameters_with_lr]

@@ -1391,12 +1367,14 @@ def main(args):
                f" {args.text_encoder_lr} and learning_rate: {args.learning_rate}. "
                f"When using prodigy only learning_rate is used as the initial learning rate."
            )
-            # changes the learning rate of text_encoder_parameters_one to be
+            # changes the learning rate of text_encoder_parameters_one and text_encoder_parameters_two to be
            # --learning_rate
            params_to_optimize[1]["lr"] = args.learning_rate
+            params_to_optimize[2]["lr"] = args.learning_rate

        optimizer = optimizer_class(
            params_to_optimize,
+            lr=args.learning_rate,
            betas=(args.adam_beta1, args.adam_beta2),
            beta3=args.prodigy_beta3,
            weight_decay=args.adam_weight_decay,
@@ -1667,12 +1645,12 @@ def main(args):
                model_input = (model_input - vae_config_shift_factor) * vae_config_scaling_factor
                model_input = model_input.to(dtype=weight_dtype)

-                vae_scale_factor = 2 ** (len(vae_config_block_out_channels) - 1)
+                vae_scale_factor = 2 ** (len(vae_config_block_out_channels))

                latent_image_ids = FluxPipeline._prepare_latent_image_ids(
                    model_input.shape[0],
-                    model_input.shape[2] // 2,
-                    model_input.shape[3] // 2,
+                    model_input.shape[2],
+                    model_input.shape[3],
                    accelerator.device,
                    weight_dtype,
                )
@@ -1706,7 +1684,7 @@ def main(args):
                )

                # handle guidance
-                if accelerator.unwrap_model(transformer).config.guidance_embeds:
+                if transformer.config.guidance_embeds:
                    guidance = torch.tensor([args.guidance_scale], device=accelerator.device)
                    guidance = guidance.expand(model_input.shape[0])
                else:
@@ -1726,8 +1704,8 @@ def main(args):
                )[0]
                model_pred = FluxPipeline._unpack_latents(
                    model_pred,
-                    height=model_input.shape[2] * vae_scale_factor,
-                    width=model_input.shape[3] * vae_scale_factor,
+                    height=int(model_input.shape[2] * vae_scale_factor / 2),
+                    width=int(model_input.shape[3] * vae_scale_factor / 2),
                    vae_scale_factor=vae_scale_factor,
                )

@@ -1819,8 +1797,6 @@ def main(args):
                # create pipeline
                if not args.train_text_encoder:
                    text_encoder_one, text_encoder_two = load_text_encoders(text_encoder_cls_one, text_encoder_cls_two)
-                    text_encoder_one.to(weight_dtype)
-                    text_encoder_two.to(weight_dtype)
                pipeline = FluxPipeline.from_pretrained(
                    args.pretrained_model_name_or_path,
                    vae=vae,
@@ -1844,9 +1820,6 @@ def main(args):
                    del text_encoder_one, text_encoder_two
                    free_memory()

-                images = None
-                del pipeline
-
    # Save the lora layers
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
@@ -1911,9 +1884,6 @@ def main(args):
                ignore_patterns=["step_*", "epoch_*"],
            )

-        images = None
-        del pipeline
-
    accelerator.end_training()


@@ -72,7 +72,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -86,15 +86,6 @@ def save_model_card(
    validation_prompt=None,
    repo_folder=None,
 ):
-    if "large" in base_model:
-        model_variant = "SD3.5-Large"
-        license_url = "https://huggingface.co/stabilityai/stable-diffusion-3.5-large/blob/main/LICENSE.md"
-        variant_tags = ["sd3.5-large", "sd3.5", "sd3.5-diffusers"]
-    else:
-        model_variant = "SD3"
-        license_url = "https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/LICENSE.md"
-        variant_tags = ["sd3", "sd3-diffusers"]
-
    widget_dict = []
    if images is not None:
        for i, image in enumerate(images):
@@ -104,7 +95,7 @@ def save_model_card(
            )

    model_description = f"""
-# {model_variant} DreamBooth LoRA - {repo_id}
+# SD3 DreamBooth LoRA - {repo_id}

 <Gallery />

@@ -129,7 +120,7 @@ You should use `{instance_prompt}` to trigger the image generation.
 ```py
 from diffusers import AutoPipelineForText2Image
 import torch
-pipeline = AutoPipelineForText2Image.from_pretrained({base_model}, torch_dtype=torch.float16).to('cuda')
+pipeline = AutoPipelineForText2Image.from_pretrained('stabilityai/stable-diffusion-3-medium-diffusers', torch_dtype=torch.float16).to('cuda')
 pipeline.load_lora_weights('{repo_id}', weight_name='pytorch_lora_weights.safetensors')
 image = pipeline('{validation_prompt if validation_prompt else instance_prompt}').images[0]
 ```
@@ -144,7 +135,7 @@ For more details, including weighting, merging and fusing LoRAs, check the [docu

 ## License

-Please adhere to the licensing terms as described [here]({license_url}).
+Please adhere to the licensing terms as described [here](https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/LICENSE).
 """
    model_card = load_or_create_model_card(
        repo_id_or_path=repo_id,
@@ -160,11 +151,11 @@ Please adhere to the licensing terms as described [here]({license_url}).
        "diffusers-training",
        "diffusers",
        "lora",
+        "sd3",
+        "sd3-diffusers",
        "template:sd-lora",
    ]

-    tags += variant_tags
-
    model_card = populate_model_card(model_card, tags=tags)
    model_card.save(os.path.join(repo_folder, "README.md"))

@@ -571,25 +562,6 @@ def parse_args(input_args=None):
        "--adam_weight_decay_text_encoder", type=float, default=1e-03, help="Weight decay to use for text_encoder"
    )

-    parser.add_argument(
-        "--lora_layers",
-        type=str,
-        default=None,
-        help=(
-            "The transformer block layers to apply LoRA training on. Please specify the layers in a comma seperated string."
-            "For examples refer to https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_SD3.md"
-        ),
-    )
-    parser.add_argument(
-        "--lora_blocks",
-        type=str,
-        default=None,
-        help=(
-            "The transformer blocks to apply LoRA training on. Please specify the block numbers in a comma seperated manner."
-            'E.g. - "--lora_blocks 12,30" will result in lora training of transformer blocks 12 and 30. For more examples refer to https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_SD3.md'
-        ),
-    )
-
    parser.add_argument(
        "--adam_epsilon",
        type=float,
@@ -1241,31 +1213,13 @@ def main(args):
        if args.train_text_encoder:
            text_encoder_one.gradient_checkpointing_enable()
            text_encoder_two.gradient_checkpointing_enable()
-    if args.lora_layers is not None:
-        target_modules = [layer.strip() for layer in args.lora_layers.split(",")]
-    else:
-        target_modules = [
-            "attn.add_k_proj",
-            "attn.add_q_proj",
-            "attn.add_v_proj",
-            "attn.to_add_out",
-            "attn.to_k",
-            "attn.to_out.0",
-            "attn.to_q",
-            "attn.to_v",
-        ]
-    if args.lora_blocks is not None:
-        target_blocks = [int(block.strip()) for block in args.lora_blocks.split(",")]
-        target_modules = [
-            f"transformer_blocks.{block}.{module}" for block in target_blocks for module in target_modules
-        ]

    # now we will add new LoRA weights to the attention layers
    transformer_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
        init_lora_weights="gaussian",
-        target_modules=target_modules,
+        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
    )
    transformer.add_adapter(transformer_lora_config)

@@ -1468,6 +1422,7 @@ def main(args):

        optimizer = optimizer_class(
            params_to_optimize,
+            lr=args.learning_rate,
            betas=(args.adam_beta1, args.adam_beta2),
            beta3=args.prodigy_beta3,
            weight_decay=args.adam_weight_decay,
@@ -78,7 +78,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -1402,6 +1402,7 @@ def main(args):

        optimizer = optimizer_class(
            params_to_optimize,
+            lr=args.learning_rate,
            betas=(args.adam_beta1, args.adam_beta2),
            beta3=args.prodigy_beta3,
            weight_decay=args.adam_weight_decay,
@@ -63,7 +63,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -77,15 +77,6 @@ def save_model_card(
    validation_prompt=None,
    repo_folder=None,
 ):
-    if "large" in base_model:
-        model_variant = "SD3.5-Large"
-        license_url = "https://huggingface.co/stabilityai/stable-diffusion-3.5-large/blob/main/LICENSE.md"
-        variant_tags = ["sd3.5-large", "sd3.5", "sd3.5-diffusers"]
-    else:
-        model_variant = "SD3"
-        license_url = "https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/LICENSE.md"
-        variant_tags = ["sd3", "sd3-diffusers"]
-
    widget_dict = []
    if images is not None:
        for i, image in enumerate(images):
@@ -95,7 +86,7 @@ def save_model_card(
            )

    model_description = f"""
-# {model_variant} DreamBooth - {repo_id}
+# SD3 DreamBooth - {repo_id}

 <Gallery />

@@ -122,7 +113,7 @@ image = pipeline('{validation_prompt if validation_prompt else instance_prompt}'

 ## License

-Please adhere to the licensing terms as described `[here]({license_url})`.
+Please adhere to the licensing terms as described `[here](https://huggingface.co/stabilityai/stable-diffusion-3-medium/blob/main/LICENSE)`.
 """
    model_card = load_or_create_model_card(
        repo_id_or_path=repo_id,
@@ -137,9 +128,10 @@ Please adhere to the licensing terms as described `[here]({license_url})`.
        "text-to-image",
        "diffusers-training",
        "diffusers",
+        "sd3",
+        "sd3-diffusers",
        "template:sd-lora",
    ]
-    tags += variant_tags

    model_card = populate_model_card(model_card, tags=tags)
    model_card.save(os.path.join(repo_folder, "README.md"))
@@ -902,26 +894,20 @@ def _encode_prompt_with_clip(
    tokenizer,
    prompt: str,
    device=None,
-    text_input_ids=None,
    num_images_per_prompt: int = 1,
 ):
    prompt = [prompt] if isinstance(prompt, str) else prompt
    batch_size = len(prompt)

-    if tokenizer is not None:
-        text_inputs = tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=77,
-            truncation=True,
-            return_tensors="pt",
-        )
-
-        text_input_ids = text_inputs.input_ids
-    else:
-        if text_input_ids is None:
-            raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
+    text_inputs = tokenizer(
+        prompt,
+        padding="max_length",
+        max_length=77,
+        truncation=True,
+        return_tensors="pt",
+    )

+    text_input_ids = text_inputs.input_ids
    prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=True)

    pooled_prompt_embeds = prompt_embeds[0]
@@ -943,7 +929,6 @@ def encode_prompt(
    max_sequence_length,
    device=None,
    num_images_per_prompt: int = 1,
-    text_input_ids_list=None,
 ):
    prompt = [prompt] if isinstance(prompt, str) else prompt

@@ -952,14 +937,13 @@ def encode_prompt(

    clip_prompt_embeds_list = []
    clip_pooled_prompt_embeds_list = []
-    for i, (tokenizer, text_encoder) in enumerate(zip(clip_tokenizers, clip_text_encoders)):
+    for tokenizer, text_encoder in zip(clip_tokenizers, clip_text_encoders):
        prompt_embeds, pooled_prompt_embeds = _encode_prompt_with_clip(
            text_encoder=text_encoder,
            tokenizer=tokenizer,
            prompt=prompt,
            device=device if device is not None else text_encoder.device,
            num_images_per_prompt=num_images_per_prompt,
-            text_input_ids=text_input_ids_list[i] if text_input_ids_list else None,
        )
        clip_prompt_embeds_list.append(prompt_embeds)
        clip_pooled_prompt_embeds_list.append(pooled_prompt_embeds)
@@ -1336,6 +1320,7 @@ def main(args):

        optimizer = optimizer_class(
            params_to_optimize,
+            lr=args.learning_rate,
            betas=(args.adam_beta1, args.adam_beta2),
            beta3=args.prodigy_beta3,
            weight_decay=args.adam_weight_decay,
@@ -57,7 +57,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -52,7 +52,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -46,7 +46,7 @@ from diffusers.utils import check_min_version, is_wandb_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -46,7 +46,7 @@ from diffusers.utils import check_min_version, is_wandb_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -51,7 +51,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -1,13 +1,4 @@
-
-## Diffusion-based Policy Learning for RL
-
-`diffusion_policy` implements [Diffusion Policy](https://diffusion-policy.cs.columbia.edu/), a diffusion model that predicts robot action sequences in reinforcement learning tasks.
-
-This example implements a robot control model for pushing a T-shaped block into a target area. The model takes in current state observations as input, and outputs a trajectory of subsequent steps to follow.
-
-To execute the script, run `diffusion_policy.py`
-
-## Diffuser Locomotion
+# Overview

 These examples show how to run [Diffuser](https://arxiv.org/abs/2205.09991) in Diffusers.
 There are two ways to use the script, `run_diffuser_locomotion.py`.
@@ -1,201 +0,0 @@
-import numpy as np
-import numpy.core.multiarray as multiarray
-import torch
-import torch.nn as nn
-from huggingface_hub import hf_hub_download
-from torch.serialization import add_safe_globals
-
-from diffusers import DDPMScheduler, UNet1DModel
-
-
-add_safe_globals(
-    [
-        multiarray._reconstruct,
-        np.ndarray,
-        np.dtype,
-        np.dtype(np.float32).type,
-        np.dtype(np.float64).type,
-        np.dtype(np.int32).type,
-        np.dtype(np.int64).type,
-        type(np.dtype(np.float32)),
-        type(np.dtype(np.float64)),
-        type(np.dtype(np.int32)),
-        type(np.dtype(np.int64)),
-    ]
-)
-
-"""
-An example of using HuggingFace's diffusers library for diffusion policy,
-generating smooth movement trajectories.
-
-This implements a robot control model for pushing a T-shaped block into a target area.
-The model takes in the robot arm position, block position, and block angle,
-then outputs a sequence of 16 (x,y) positions for the robot arm to follow.
-"""
-
-
-class ObservationEncoder(nn.Module):
-    """
-    Converts raw robot observations (positions/angles) into a more compact representation
-
-    state_dim (int): Dimension of the input state vector (default: 5)
-        [robot_x, robot_y, block_x, block_y, block_angle]
-
-    - Input shape: (batch_size, state_dim)
-    - Output shape: (batch_size, 256)
-    """
-
-    def __init__(self, state_dim):
-        super().__init__()
-        self.net = nn.Sequential(nn.Linear(state_dim, 512), nn.ReLU(), nn.Linear(512, 256))
-
-    def forward(self, x):
-        return self.net(x)
-
-
-class ObservationProjection(nn.Module):
-    """
-    Takes the encoded observation and transforms it into 32 values that represent the current robot/block situation.
-    These values are used as additional contextual information during the diffusion model's trajectory generation.
-
-    - Input: 256-dim vector (padded to 512)
-            Shape: (batch_size, 256)
-    - Output: 32 contextual information values for the diffusion model
-            Shape: (batch_size, 32)
-    """
-
-    def __init__(self):
-        super().__init__()
-        self.weight = nn.Parameter(torch.randn(32, 512))
-        self.bias = nn.Parameter(torch.zeros(32))
-
-    def forward(self, x):  # pad 256-dim input to 512-dim with zeros
-        if x.size(-1) == 256:
-            x = torch.cat([x, torch.zeros(*x.shape[:-1], 256, device=x.device)], dim=-1)
-        return nn.functional.linear(x, self.weight, self.bias)
-
-
-class DiffusionPolicy:
-    """
-    Implements diffusion policy for generating robot arm trajectories.
-    Uses diffusion to generate sequences of positions for a robot arm, conditioned on
-    the current state of the robot and the block it needs to push.
-
-    The model expects observations in pixel coordinates (0-512 range) and block angle in radians.
-    It generates trajectories as sequences of (x,y) coordinates also in the 0-512 range.
-    """
-
-    def __init__(self, state_dim=5, device="cpu"):
-        self.device = device
-
-        # define valid ranges for inputs/outputs
-        self.stats = {
-            "obs": {"min": torch.zeros(5), "max": torch.tensor([512, 512, 512, 512, 2 * np.pi])},
-            "action": {"min": torch.zeros(2), "max": torch.full((2,), 512)},
-        }
-
-        self.obs_encoder = ObservationEncoder(state_dim).to(device)
-        self.obs_projection = ObservationProjection().to(device)
-
-        # UNet model that performs the denoising process
-        # takes in concatenated action (2 channels) and context (32 channels) = 34 channels
-        # outputs predicted action (2 channels for x,y coordinates)
-        self.model = UNet1DModel(
-            sample_size=16,  # length of trajectory sequence
-            in_channels=34,
-            out_channels=2,
-            layers_per_block=2,  # number of layers per each UNet block
-            block_out_channels=(128,),  # number of output neurons per layer in each block
-            down_block_types=("DownBlock1D",),  # reduce the resolution of data
-            up_block_types=("UpBlock1D",),  # increase the resolution of data
-        ).to(device)
-
-        # noise scheduler that controls the denoising process
-        self.noise_scheduler = DDPMScheduler(
-            num_train_timesteps=100,  # number of denoising steps
-            beta_schedule="squaredcos_cap_v2",  # type of noise schedule
-        )
-
-        # load pre-trained weights from HuggingFace
-        checkpoint = torch.load(
-            hf_hub_download("dorsar/diffusion_policy", "push_tblock.pt"), weights_only=True, map_location=device
-        )
-        self.model.load_state_dict(checkpoint["model_state_dict"])
-        self.obs_encoder.load_state_dict(checkpoint["encoder_state_dict"])
-        self.obs_projection.load_state_dict(checkpoint["projection_state_dict"])
-
-    # scales data to [-1, 1] range for neural network processing
-    def normalize_data(self, data, stats):
-        return ((data - stats["min"]) / (stats["max"] - stats["min"])) * 2 - 1
-
-    # converts normalized data back to original range
-    def unnormalize_data(self, ndata, stats):
-        return ((ndata + 1) / 2) * (stats["max"] - stats["min"]) + stats["min"]
-
-    @torch.no_grad()
-    def predict(self, observation):
-        """
-        Generates a trajectory of robot arm positions given the current state.
-
-        Args:
-            observation (torch.Tensor): Current state [robot_x, robot_y, block_x, block_y, block_angle]
-                                    Shape: (batch_size, 5)
-
-        Returns:
-            torch.Tensor: Sequence of (x,y) positions for the robot arm to follow
-                        Shape: (batch_size, 16, 2) where:
-                        - 16 is the number of steps in the trajectory
-                        - 2 is the (x,y) coordinates in pixel space (0-512)
-
-        The function first encodes the observation, then uses it to condition a diffusion
-        process that gradually denoises random trajectories into smooth, purposeful movements.
-        """
-        observation = observation.to(self.device)
-        normalized_obs = self.normalize_data(observation, self.stats["obs"])
-
-        # encode the observation into context values for the diffusion model
-        cond = self.obs_projection(self.obs_encoder(normalized_obs))
-        # keeps first & second dimension sizes unchanged, and multiplies last dimension by 16
-        cond = cond.view(normalized_obs.shape[0], -1, 1).expand(-1, -1, 16)
-
-        # initialize action with noise - random noise that will be refined into a trajectory
-        action = torch.randn((observation.shape[0], 2, 16), device=self.device)
-
-        # denoise
-        # at each step `t`, the current noisy trajectory (`action`) & conditioning info (context) are
-        # fed into the model to predict a denoised trajectory, then uses self.noise_scheduler.step to
-        # apply this prediction & slightly reduce the noise in `action` more
-
-        self.noise_scheduler.set_timesteps(100)
-        for t in self.noise_scheduler.timesteps:
-            model_output = self.model(torch.cat([action, cond], dim=1), t)
-            action = self.noise_scheduler.step(model_output.sample, t, action).prev_sample
-
-        action = action.transpose(1, 2)  # reshape to [batch, 16, 2]
-        action = self.unnormalize_data(action, self.stats["action"])  # scale back to coordinates
-        return action
-
-
-if __name__ == "__main__":
-    policy = DiffusionPolicy()
-
-    # sample of a single observation
-    # robot arm starts in center, block is slightly left and up, rotated 90 degrees
-    obs = torch.tensor(
-        [
-            [
-                256.0,  # robot arm x position (middle of screen)
-                256.0,  # robot arm y position (middle of screen)
-                200.0,  # block x position
-                300.0,  # block y position
-                np.pi / 2,  # block angle (90 degrees)
-            ]
-        ]
-    )
-
-    action = policy.predict(obs)
-
-    print("Action shape:", action.shape)  # should be [1, 16, 2] - one trajectory of 16 x,y positions
-    print("\nPredicted trajectory:")
-    for i, (x, y) in enumerate(action[0]):
-        print(f"Step {i:2d}: x={x:6.1f}, y={y:6.1f}")
@@ -1,167 +0,0 @@
-## LoRA fine-tuning Flux.1 Dev with quantization
-
-> [!NOTE]  
-> This example is educational in nature and fixes some arguments to keep things simple. It should act as a reference to build things further.
-
-This example shows how to fine-tune [Flux.1 Dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) with LoRA and quantization. We show this by using the [`Norod78/Yarn-art-style`](https://huggingface.co/datasets/Norod78/Yarn-art-style) dataset. Steps below summarize the workflow:
-
-* We precompute the text embeddings in `compute_embeddings.py` and serialize them into a parquet file.
-  * Even though optional, we load the T5-xxl in NF4 to further reduce the memory foot-print. 
-* `train_dreambooth_lora_flux_miniature.py` takes care of training:
-  * Since we already precomputed the text embeddings, we don't load the text encoders.
-  * We load the VAE and use it to precompute the image latents and we then delete it. 
-  * Load the Flux transformer, quantize it with the [NF4 datatype](https://arxiv.org/abs/2305.14314) through `bitsandbytes`, prepare it for 4bit training. 
-  * Add LoRA adapter layers to it and then ensure they are kept in FP32 precision.
-  * Train!
-
-To run training in a memory-optimized manner, we additionally use:
-
-* 8Bit Adam
-* Gradient checkpointing 
-
-We have tested the scripts on a 24GB 4090. It works on a free-tier Colab Notebook, too, but it's extremely slow. 
-
-## Training
-
-Ensure you have installed the required libraries:
-
-```bash
-pip install -U transformers accelerate bitsandbytes peft datasets 
-pip install git+https://github.com/huggingface/diffusers -U
-```
-
-Now, compute the text embeddings:
-
-```bash
-python compute_embeddings.py
-```
-
-It should create a file named `embeddings.parquet`. We're then ready to launch training. First, authenticate so that you can access the Flux.1 Dev model: 
-
-```bash
-huggingface-cli
-```
-
-Then launch:
-
-```bash
-accelerate launch --config_file=accelerate.yaml \
-  train_dreambooth_lora_flux_miniature.py \
-  --pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \
-  --data_df_path="embeddings.parquet" \
-  --output_dir="yarn_art_lora_flux_nf4" \
-  --mixed_precision="fp16" \
-  --use_8bit_adam \
-  --weighting_scheme="none" \
-  --resolution=1024 \
-  --train_batch_size=1 \
-  --repeats=1 \
-  --learning_rate=1e-4 \
-  --guidance_scale=1 \
-  --report_to="wandb" \
-  --gradient_accumulation_steps=4 \
-  --gradient_checkpointing \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --cache_latents \
-  --rank=4 \
-  --max_train_steps=700 \
-  --seed="0"
-```
-
-We can direcly pass a quantized checkpoint path, too:
-
-```diff
-+ --quantized_model_path="hf-internal-testing/flux.1-dev-nf4-pkg"
-```
-
-Depending on the machine, training time will vary but for our case, it was 1.5 hours. It maybe possible to speed this up by using `torch.bfloat16`. 
-
-We support training with the DeepSpeed Zero2 optimizer, too. To use it, first install DeepSpeed:
-
-```bash
-pip install -Uq deepspeed
-```
-
-And then launch:
-
-```bash
-accelerate launch --config_file=ds2.yaml \
-  train_dreambooth_lora_flux_miniature.py \
-  --pretrained_model_name_or_path="black-forest-labs/FLUX.1-dev" \
-  --data_df_path="embeddings.parquet" \
-  --output_dir="yarn_art_lora_flux_nf4" \
-  --mixed_precision="no" \
-  --use_8bit_adam \
-  --weighting_scheme="none" \
-  --resolution=1024 \
-  --train_batch_size=1 \
-  --repeats=1 \
-  --learning_rate=1e-4 \
-  --guidance_scale=1 \
-  --report_to="wandb" \
-  --gradient_accumulation_steps=4 \
-  --gradient_checkpointing \
-  --lr_scheduler="constant" \
-  --lr_warmup_steps=0 \
-  --cache_latents \
-  --rank=4 \
-  --max_train_steps=700 \
-  --seed="0"
-```
-
-## Inference
-
-When loading the LoRA params (that were obtained on a quantized base model) and merging them into the base model, it is recommended to first dequantize the base model, merge the LoRA params into it, and then quantize the model again. This is because merging into 4bit quantized models can lead to some rounding errors. Below, we provide an end-to-end example:
-
-1. First, load the original model and merge the LoRA params into it:
-
-```py
-from diffusers import FluxPipeline 
-import torch 
-
-ckpt_id = "black-forest-labs/FLUX.1-dev"
-pipeline = FluxPipeline.from_pretrained(
-    ckpt_id, text_encoder=None, text_encoder_2=None, torch_dtype=torch.float16
-)
-pipeline.load_lora_weights("yarn_art_lora_flux_nf4", weight_name="pytorch_lora_weights.safetensors")
-pipeline.fuse_lora()
-pipeline.unload_lora_weights()
-
-pipeline.transformer.save_pretrained("fused_transformer")
-```
-
-2. Quantize the model and run inference
-
-```py
-from diffusers import AutoPipelineForText2Image, FluxTransformer2DModel, BitsAndBytesConfig
-import torch
-
-ckpt_id = "black-forest-labs/FLUX.1-dev"
-bnb_4bit_compute_dtype = torch.float16
-nf4_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=bnb_4bit_compute_dtype,
-)
-transformer = FluxTransformer2DModel.from_pretrained(
-    "fused_transformer",
-    quantization_config=nf4_config,
-    torch_dtype=bnb_4bit_compute_dtype,
-)
-pipeline = AutoPipelineForText2Image.from_pretrained(
-    ckpt_id, transformer=transformer, torch_dtype=bnb_4bit_compute_dtype
-)
-pipeline.enable_model_cpu_offload()
-
-image = pipeline(
-    "a puppy in a pond, yarn art style", num_inference_steps=28, guidance_scale=3.5, height=768
-).images[0]
-image.save("yarn_merged.png")
-```
-
-|   Dequantize, merge, quantize   |   Merging directly into quantized model   |
-|-------|-------|
-| ![Image A](https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/quantized_flux_training/merged.png) | ![Image B](https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/quantized_flux_training/unmerged.png) |
-
-As we can notice the first column result follows the style more closely.
@@ -1,17 +0,0 @@
-compute_environment: LOCAL_MACHINE
-debug: false
-distributed_type: NO
-downcast_bf16: 'no'
-enable_cpu_affinity: true
-gpu_ids: all
-machine_rank: 0
-main_training_function: main
-mixed_precision: bf16
-num_machines: 1
-num_processes: 1
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
@@ -1,107 +0,0 @@
-#!/usr/bin/env python
-# coding=utf-8
-# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-
-import pandas as pd
-import torch
-from datasets import load_dataset
-from huggingface_hub.utils import insecure_hashlib
-from tqdm.auto import tqdm
-from transformers import T5EncoderModel
-
-from diffusers import FluxPipeline
-
-
-MAX_SEQ_LENGTH = 77
-OUTPUT_PATH = "embeddings.parquet"
-
-
-def generate_image_hash(image):
-    return insecure_hashlib.sha256(image.tobytes()).hexdigest()
-
-
-def load_flux_dev_pipeline():
-    id = "black-forest-labs/FLUX.1-dev"
-    text_encoder = T5EncoderModel.from_pretrained(id, subfolder="text_encoder_2", load_in_8bit=True, device_map="auto")
-    pipeline = FluxPipeline.from_pretrained(
-        id, text_encoder_2=text_encoder, transformer=None, vae=None, device_map="balanced"
-    )
-    return pipeline
-
-
-@torch.no_grad()
-def compute_embeddings(pipeline, prompts, max_sequence_length):
-    all_prompt_embeds = []
-    all_pooled_prompt_embeds = []
-    all_text_ids = []
-    for prompt in tqdm(prompts, desc="Encoding prompts."):
-        (
-            prompt_embeds,
-            pooled_prompt_embeds,
-            text_ids,
-        ) = pipeline.encode_prompt(prompt=prompt, prompt_2=None, max_sequence_length=max_sequence_length)
-        all_prompt_embeds.append(prompt_embeds)
-        all_pooled_prompt_embeds.append(pooled_prompt_embeds)
-        all_text_ids.append(text_ids)
-
-    max_memory = torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024
-    print(f"Max memory allocated: {max_memory:.3f} GB")
-    return all_prompt_embeds, all_pooled_prompt_embeds, all_text_ids
-
-
-def run(args):
-    dataset = load_dataset("Norod78/Yarn-art-style", split="train")
-    image_prompts = {generate_image_hash(sample["image"]): sample["text"] for sample in dataset}
-    all_prompts = list(image_prompts.values())
-    print(f"{len(all_prompts)=}")
-
-    pipeline = load_flux_dev_pipeline()
-    all_prompt_embeds, all_pooled_prompt_embeds, all_text_ids = compute_embeddings(
-        pipeline, all_prompts, args.max_sequence_length
-    )
-
-    data = []
-    for i, (image_hash, _) in enumerate(image_prompts.items()):
-        data.append((image_hash, all_prompt_embeds[i], all_pooled_prompt_embeds[i], all_text_ids[i]))
-    print(f"{len(data)=}")
-
-    # Create a DataFrame
-    embedding_cols = ["prompt_embeds", "pooled_prompt_embeds", "text_ids"]
-    df = pd.DataFrame(data, columns=["image_hash"] + embedding_cols)
-    print(f"{len(df)=}")
-
-    # Convert embedding lists to arrays (for proper storage in parquet)
-    for col in embedding_cols:
-        df[col] = df[col].apply(lambda x: x.cpu().numpy().flatten().tolist())
-
-    # Save the dataframe to a parquet file
-    df.to_parquet(args.output_path)
-    print(f"Data successfully serialized to {args.output_path}")
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--max_sequence_length",
-        type=int,
-        default=MAX_SEQ_LENGTH,
-        help="Maximum sequence length to use for computing the embeddings. The more the higher computational costs.",
-    )
-    parser.add_argument("--output_path", type=str, default=OUTPUT_PATH, help="Path to serialize the parquet file.")
-    args = parser.parse_args()
-
-    run(args)
@@ -1,23 +0,0 @@
-compute_environment: LOCAL_MACHINE
-debug: false
-deepspeed_config:
-  gradient_accumulation_steps: 1
-  gradient_clipping: 1.0
-  offload_optimizer_device: cpu
-  offload_param_device: cpu
-  zero3_init_flag: false
-  zero_stage: 2
-distributed_type: DEEPSPEED
-downcast_bf16: 'no'
-enable_cpu_affinity: false
-machine_rank: 0
-main_training_function: main
-mixed_precision: 'no'
-num_machines: 1
-num_processes: 1
-rdzv_backend: static
-same_network: true
-tpu_env: []
-tpu_use_cluster: false
-tpu_use_sudo: false
-use_cpu: false
@@ -1,2 +0,0 @@
-images/
-output/
@@ -1,307 +0,0 @@
-from typing import Any, Dict, Optional
-
-import torch
-from torch import nn
-
-from diffusers.configuration_utils import ConfigMixin, register_to_config
-from diffusers.models import PixArtTransformer2DModel
-from diffusers.models.attention import BasicTransformerBlock
-from diffusers.models.modeling_outputs import Transformer2DModelOutput
-from diffusers.models.modeling_utils import ModelMixin
-from diffusers.utils.torch_utils import is_torch_version
-
-
-class PixArtControlNetAdapterBlock(nn.Module):
-    def __init__(
-        self,
-        block_index,
-        # taken from PixArtTransformer2DModel
-        num_attention_heads: int = 16,
-        attention_head_dim: int = 72,
-        dropout: float = 0.0,
-        cross_attention_dim: Optional[int] = 1152,
-        attention_bias: bool = True,
-        activation_fn: str = "gelu-approximate",
-        num_embeds_ada_norm: Optional[int] = 1000,
-        upcast_attention: bool = False,
-        norm_type: str = "ada_norm_single",
-        norm_elementwise_affine: bool = False,
-        norm_eps: float = 1e-6,
-        attention_type: Optional[str] = "default",
-    ):
-        super().__init__()
-
-        self.block_index = block_index
-        self.inner_dim = num_attention_heads * attention_head_dim
-
-        # the first block has a zero before layer
-        if self.block_index == 0:
-            self.before_proj = nn.Linear(self.inner_dim, self.inner_dim)
-            nn.init.zeros_(self.before_proj.weight)
-            nn.init.zeros_(self.before_proj.bias)
-
-        self.transformer_block = BasicTransformerBlock(
-            self.inner_dim,
-            num_attention_heads,
-            attention_head_dim,
-            dropout=dropout,
-            cross_attention_dim=cross_attention_dim,
-            activation_fn=activation_fn,
-            num_embeds_ada_norm=num_embeds_ada_norm,
-            attention_bias=attention_bias,
-            upcast_attention=upcast_attention,
-            norm_type=norm_type,
-            norm_elementwise_affine=norm_elementwise_affine,
-            norm_eps=norm_eps,
-            attention_type=attention_type,
-        )
-
-        self.after_proj = nn.Linear(self.inner_dim, self.inner_dim)
-        nn.init.zeros_(self.after_proj.weight)
-        nn.init.zeros_(self.after_proj.bias)
-
-    def train(self, mode: bool = True):
-        self.transformer_block.train(mode)
-
-        if self.block_index == 0:
-            self.before_proj.train(mode)
-
-        self.after_proj.train(mode)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        controlnet_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        timestep: Optional[torch.LongTensor] = None,
-        added_cond_kwargs: Dict[str, torch.Tensor] = None,
-        cross_attention_kwargs: Dict[str, Any] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.Tensor] = None,
-    ):
-        if self.block_index == 0:
-            controlnet_states = self.before_proj(controlnet_states)
-            controlnet_states = hidden_states + controlnet_states
-
-        controlnet_states_down = self.transformer_block(
-            hidden_states=controlnet_states,
-            encoder_hidden_states=encoder_hidden_states,
-            timestep=timestep,
-            added_cond_kwargs=added_cond_kwargs,
-            cross_attention_kwargs=cross_attention_kwargs,
-            attention_mask=attention_mask,
-            encoder_attention_mask=encoder_attention_mask,
-            class_labels=None,
-        )
-
-        controlnet_states_left = self.after_proj(controlnet_states_down)
-
-        return controlnet_states_left, controlnet_states_down
-
-
-class PixArtControlNetAdapterModel(ModelMixin, ConfigMixin):
-    # N=13, as specified in the paper https://arxiv.org/html/2401.05252v1/#S4 ControlNet-Transformer
-    @register_to_config
-    def __init__(self, num_layers=13) -> None:
-        super().__init__()
-
-        self.num_layers = num_layers
-
-        self.controlnet_blocks = nn.ModuleList(
-            [PixArtControlNetAdapterBlock(block_index=i) for i in range(num_layers)]
-        )
-
-    @classmethod
-    def from_transformer(cls, transformer: PixArtTransformer2DModel):
-        control_net = PixArtControlNetAdapterModel()
-
-        # copied the specified number of blocks from the transformer
-        for depth in range(control_net.num_layers):
-            control_net.controlnet_blocks[depth].transformer_block.load_state_dict(
-                transformer.transformer_blocks[depth].state_dict()
-            )
-
-        return control_net
-
-    def train(self, mode: bool = True):
-        for block in self.controlnet_blocks:
-            block.train(mode)
-
-
-class PixArtControlNetTransformerModel(ModelMixin, ConfigMixin):
-    """Runs a `PixArtTransformer2DModel` with the outputs of a `PixArtControlNetAdapterModel` injected.
-
-    The forward pass reuses the transformer's embeddings, blocks and output layers. Before transformer
-    blocks `1..blocks_num` the matching adapter block is evaluated and its result is added to the hidden
-    states (only when `controlnet_cond` is given).
-
-    Args:
-        transformer: the base transformer; its config is registered as this model's config.
-        controlnet: the adapter model providing `controlnet_blocks`.
-        blocks_num: number of transformer blocks that receive a ControlNet contribution.
-        init_from_transformer: if True, the first `blocks_num` adapter blocks are initialised in place from
-            the corresponding transformer blocks.
-        training: initial value of the `training` flag.
-    """
-
-    def __init__(
-        self,
-        transformer: PixArtTransformer2DModel,
-        controlnet: PixArtControlNetAdapterModel,
-        blocks_num=13,
-        init_from_transformer=False,
-        training=False,
-    ):
-        super().__init__()
-
-        self.blocks_num = blocks_num
-        self.gradient_checkpointing = False
-        self.register_to_config(**transformer.config)
-        self.training = training
-
-        if init_from_transformer:
-            # Copy the specified number of blocks from the transformer into the controlnet that was passed
-            # in. `from_transformer` is a classmethod that builds and returns a *new* adapter, so calling it
-            # here would leave `controlnet` untouched.
-            for depth in range(self.blocks_num):
-                controlnet.controlnet_blocks[depth].transformer_block.load_state_dict(
-                    transformer.transformer_blocks[depth].state_dict()
-                )
-
-        self.transformer = transformer
-        self.controlnet = controlnet
-
-    def _set_gradient_checkpointing(self, module, value=False):
-        # only modules that expose the flag are toggled
-        if hasattr(module, "gradient_checkpointing"):
-            module.gradient_checkpointing = value
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        timestep: Optional[torch.LongTensor] = None,
-        controlnet_cond: Optional[torch.Tensor] = None,
-        added_cond_kwargs: Dict[str, torch.Tensor] = None,
-        cross_attention_kwargs: Dict[str, Any] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        encoder_attention_mask: Optional[torch.Tensor] = None,
-        return_dict: bool = True,
-    ):
-        """Run the transformer, adding the ControlNet residuals in front of blocks `1..blocks_num`.
-
-        Args:
-            hidden_states: input passed through the transformer's `pos_embed`.
-            encoder_hidden_states: conditioning passed to every block (projected first when the transformer
-                has a `caption_projection`).
-            timestep: passed to the transformer's `adaln_single`.
-            controlnet_cond: control input; when None the ControlNet branch is skipped entirely.
-            added_cond_kwargs: extra conditions for `adaln_single`.
-            cross_attention_kwargs: forwarded to the blocks.
-            attention_mask: 2D masks (1 = keep, 0 = discard) are converted to an additive bias.
-            encoder_attention_mask: converted the same way as `attention_mask`.
-            return_dict: return a `Transformer2DModelOutput` when True, otherwise a one-element tuple.
-
-        Raises:
-            ValueError: if the transformer uses additional conditions and `added_cond_kwargs` is None.
-            NotImplementedError: if called in training mode with gradient checkpointing enabled.
-        """
-        if self.transformer.use_additional_conditions and added_cond_kwargs is None:
-            raise ValueError("`added_cond_kwargs` cannot be None when using additional conditions for `adaln_single`.")
-
-        # ensure attention_mask is a bias, and give it a singleton query_tokens dimension.
-        #   we may have done this conversion already, e.g. if we came here via UNet2DConditionModel#forward.
-        #   we can tell by counting dims; if ndim == 2: it's a mask rather than a bias.
-        # expects mask of shape:
-        #   [batch, key_tokens]
-        # adds singleton query_tokens dimension:
-        #   [batch,                    1, key_tokens]
-        # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes:
-        #   [batch,  heads, query_tokens, key_tokens] (e.g. torch sdp attn)
-        #   [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn)
-        if attention_mask is not None and attention_mask.ndim == 2:
-            # assume that mask is expressed as:
-            #   (1 = keep,      0 = discard)
-            # convert mask into a bias that can be added to attention scores:
-            #       (keep = +0,     discard = -10000.0)
-            attention_mask = (1 - attention_mask.to(hidden_states.dtype)) * -10000.0
-            attention_mask = attention_mask.unsqueeze(1)
-
-        # convert encoder_attention_mask to a bias the same way we do for attention_mask
-        if encoder_attention_mask is not None and encoder_attention_mask.ndim == 2:
-            encoder_attention_mask = (1 - encoder_attention_mask.to(hidden_states.dtype)) * -10000.0
-            encoder_attention_mask = encoder_attention_mask.unsqueeze(1)
-
-        # 1. Input
-        batch_size = hidden_states.shape[0]
-        height, width = (
-            hidden_states.shape[-2] // self.transformer.config.patch_size,
-            hidden_states.shape[-1] // self.transformer.config.patch_size,
-        )
-        hidden_states = self.transformer.pos_embed(hidden_states)
-
-        timestep, embedded_timestep = self.transformer.adaln_single(
-            timestep, added_cond_kwargs, batch_size=batch_size, hidden_dtype=hidden_states.dtype
-        )
-
-        if self.transformer.caption_projection is not None:
-            encoder_hidden_states = self.transformer.caption_projection(encoder_hidden_states)
-            encoder_hidden_states = encoder_hidden_states.view(batch_size, -1, hidden_states.shape[-1])
-
-        controlnet_states_down = None
-        if controlnet_cond is not None:
-            controlnet_states_down = self.transformer.pos_embed(controlnet_cond)
-
-        # 2. Blocks
-        if self.training and self.gradient_checkpointing:
-            # rc todo: for training and gradient checkpointing
-            # raise instead of terminating the whole process so that callers can handle it
-            raise NotImplementedError(
-                "Gradient checkpointing is not supported for the controlnet transformer model, yet."
-            )
-
-        for block_index, block in enumerate(self.transformer.transformer_blocks):
-            # the control nets are only used for the blocks 1 to self.blocks_num
-            if block_index > 0 and block_index <= self.blocks_num and controlnet_states_down is not None:
-                controlnet_states_left, controlnet_states_down = self.controlnet.controlnet_blocks[
-                    block_index - 1
-                ](
-                    hidden_states=hidden_states,  # used only in the first block
-                    controlnet_states=controlnet_states_down,
-                    encoder_hidden_states=encoder_hidden_states,
-                    timestep=timestep,
-                    added_cond_kwargs=added_cond_kwargs,
-                    cross_attention_kwargs=cross_attention_kwargs,
-                    attention_mask=attention_mask,
-                    encoder_attention_mask=encoder_attention_mask,
-                )
-
-                hidden_states = hidden_states + controlnet_states_left
-
-            hidden_states = block(
-                hidden_states,
-                attention_mask=attention_mask,
-                encoder_hidden_states=encoder_hidden_states,
-                encoder_attention_mask=encoder_attention_mask,
-                timestep=timestep,
-                cross_attention_kwargs=cross_attention_kwargs,
-                class_labels=None,
-            )
-
-        # 3. Output
-        shift, scale = (
-            self.transformer.scale_shift_table[None]
-            + embedded_timestep[:, None].to(self.transformer.scale_shift_table.device)
-        ).chunk(2, dim=1)
-        hidden_states = self.transformer.norm_out(hidden_states)
-        # Modulation
-        hidden_states = hidden_states * (1 + scale.to(hidden_states.device)) + shift.to(hidden_states.device)
-        hidden_states = self.transformer.proj_out(hidden_states)
-        hidden_states = hidden_states.squeeze(1)
-
-        # unpatchify
-        hidden_states = hidden_states.reshape(
-            shape=(
-                -1,
-                height,
-                width,
-                self.transformer.config.patch_size,
-                self.transformer.config.patch_size,
-                self.transformer.out_channels,
-            )
-        )
-        hidden_states = torch.einsum("nhwpqc->nchpwq", hidden_states)
-        output = hidden_states.reshape(
-            shape=(
-                -1,
-                self.transformer.out_channels,
-                height * self.transformer.config.patch_size,
-                width * self.transformer.config.patch_size,
-            )
-        )
-
-        if not return_dict:
-            return (output,)
-
-        return Transformer2DModelOutput(sample=output)
@@ -1,6 +0,0 @@
-transformers
-SentencePiece
-torchvision
-controlnet-aux
-datasets
-# wandb
@@ -1,75 +0,0 @@
-import torch
-import torchvision.transforms as T
-from controlnet_aux import HEDdetector
-
-from diffusers.utils import load_image
-from examples.research_projects.pixart.controlnet_pixart_alpha import PixArtControlNetAdapterModel
-from examples.research_projects.pixart.pipeline_pixart_alpha_controlnet import PixArtAlphaControlnetPipeline
-
-
-# Example script: runs the PixArt-alpha ControlNet pipeline on one control image, using HED edges
-# extracted from that image as the conditioning input.
-controlnet_repo_id = "raulc0399/pixart-alpha-hed-controlnet"
-
-weight_dtype = torch.float16
-image_size = 1024
-
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-torch.manual_seed(0)
-
-# load controlnet
-controlnet = PixArtControlNetAdapterModel.from_pretrained(
-    controlnet_repo_id,
-    torch_dtype=weight_dtype,
-    use_safetensors=True,
-).to(device)
-
-pipe = PixArtAlphaControlnetPipeline.from_pretrained(
-    "PixArt-alpha/PixArt-XL-2-1024-MS",
-    controlnet=controlnet,
-    torch_dtype=weight_dtype,
-    use_safetensors=True,
-).to(device)
-
-images_path = "images"
-control_image_file = "0_7.jpg"
-
-# alternative prompts, kept for experimentation:
-# prompt = "cinematic photo of superman in action . 35mm photograph, film, bokeh, professional, 4k, highly detailed"
-# prompt = "yellow modern car, city in background, beautiful rainy day"
-# prompt = "modern villa, clear sky, sunny day . 35mm photograph, film, bokeh, professional, 4k, highly detailed"
-# prompt = "robot dog toy in park . 35mm photograph, film, bokeh, professional, 4k, highly detailed"
-# prompt = "purple car, on highway, beautiful sunny day"
-# prompt = "realistic photo of a loving couple standing in the open kitchen of the living room, cooking ."
-prompt = "battleship in space, galaxy in background"
-
-# file name without its extension, reused for the derived output files
-control_image_name = control_image_file.split(".")[0]
-
-control_image = load_image(f"{images_path}/{control_image_file}")
-print(control_image.size)
-# PIL's `Image.size` is (width, height) - assumes `load_image` returns a PIL image, as the
-# `.convert("RGB")` call below also does
-width, height = control_image.size
-
-hed = HEDdetector.from_pretrained("lllyasviel/Annotators")
-
-# convert to RGB and center-crop to the square resolution used for generation
-condition_transform = T.Compose(
-    [
-        T.Lambda(lambda img: img.convert("RGB")),
-        T.CenterCrop([image_size, image_size]),
-    ]
-)
-
-control_image = condition_transform(control_image)
-hed_edge = hed(control_image, detect_resolution=image_size, image_resolution=image_size)
-
-# keep the edge map next to the input so the conditioning can be inspected
-hed_edge.save(f"{images_path}/{control_image_name}_hed.jpg")
-
-# run pipeline
-with torch.no_grad():
-    out = pipe(
-        prompt=prompt,
-        image=hed_edge,
-        num_inference_steps=14,
-        guidance_scale=4.5,
-        height=image_size,
-        width=image_size,
-    )
-
-    out.images[0].save(f"{images_path}/{control_image_name}_output.jpg")
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-# Launches ControlNet training for PixArt through `accelerate`.
-#
-# Before the first run, configure accelerate with:
-#   accelerate config
-#
-# and check the resulting setup with:
-#   accelerate env
-
-export MODEL_DIR="PixArt-alpha/PixArt-XL-2-512x512"
-export OUTPUT_DIR="output/pixart-controlnet-hf-diffusers-test"
-
-# The variables are quoted so that paths containing spaces are passed as a single argument.
-accelerate launch ./train_pixart_controlnet_hf.py --mixed_precision="fp16" \
- --pretrained_model_name_or_path="$MODEL_DIR" \
- --output_dir="$OUTPUT_DIR" \
- --dataset_name=fusing/fill50k \
- --resolution=512 \
- --learning_rate=1e-5 \
- --train_batch_size=1 \
- --gradient_accumulation_steps=4 \
- --report_to="wandb" \
- --seed=42 \
- --dataloader_num_workers=8
-# optional extra arguments:
-#  --lr_scheduler="cosine" --lr_warmup_steps=0 \
@@ -229,11 +229,11 @@ class PromptDiffusionControlNetModel(ControlNetModel):
                In this mode, the ControlNet encoder tries its best to recognize the input content of the input even if
                you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended.
            return_dict (`bool`, defaults to `True`):
-                Whether or not to return a [`~models.controlnets.controlnet.ControlNetOutput`] instead of a plain tuple.
+                Whether or not to return a [`~models.controlnet.ControlNetOutput`] instead of a plain tuple.

        Returns:
-            [`~models.controlnets.controlnet.ControlNetOutput`] **or** `tuple`:
-                If `return_dict` is `True`, a [`~models.controlnets.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
+            [`~models.controlnet.ControlNetOutput`] **or** `tuple`:
+                If `return_dict` is `True`, a [`~models.controlnet.ControlNetOutput`] is returned, otherwise a tuple is
                returned where the first element is the sample tensor.
        """
        # check channel order
@@ -1475,6 +1475,7 @@ def main(args):

        optimizer = optimizer_class(
            params_to_optimize,
+            lr=args.learning_rate,
            betas=(args.adam_beta1, args.adam_beta2),
            beta3=args.prodigy_beta3,
            weight_decay=args.adam_weight_decay,
@@ -60,7 +60,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -57,7 +57,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -49,7 +49,7 @@ from diffusers.utils import check_min_version


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = logging.getLogger(__name__)

@@ -56,7 +56,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -68,7 +68,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)
 if is_torch_npu_available():
@@ -55,7 +55,7 @@ from diffusers.utils.torch_utils import is_compiled_module


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)
 if is_torch_npu_available():
@@ -81,7 +81,7 @@ else:


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -56,7 +56,7 @@ else:
 # ------------------------------------------------------------------------------

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = logging.getLogger(__name__)

@@ -76,7 +76,7 @@ else:


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__)

@@ -29,7 +29,7 @@ from diffusers.utils.import_utils import is_xformers_available


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -50,7 +50,7 @@ if is_wandb_available():
    import wandb

 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -50,7 +50,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -51,7 +51,7 @@ if is_wandb_available():


 # Will error if the minimal version of diffusers is not installed. Remove at your own risks.
-check_min_version("0.32.0.dev0")
+check_min_version("0.31.0.dev0")

 logger = get_logger(__name__, log_level="INFO")

@@ -1,461 +0,0 @@
-import argparse
-from contextlib import nullcontext
-
-import torch
-from accelerate import init_empty_weights
-from safetensors.torch import load_file
-from transformers import T5EncoderModel, T5Tokenizer
-
-from diffusers import AutoencoderKLMochi, FlowMatchEulerDiscreteScheduler, MochiPipeline, MochiTransformer3DModel
-from diffusers.utils.import_utils import is_accelerate_available
-
-
-# Context manager used when instantiating models: empty-weight initialisation when accelerate is
-# available, otherwise a no-op. `is_accelerate_available` must be *called*; the bare function object is
-# always truthy.
-CTX = init_empty_weights if is_accelerate_available() else nullcontext
-
-TOKENIZER_MAX_LENGTH = 256
-
-# Command-line interface of the conversion script.
-parser = argparse.ArgumentParser()
-parser.add_argument("--transformer_checkpoint_path", default=None, type=str)
-parser.add_argument("--vae_encoder_checkpoint_path", default=None, type=str)
-parser.add_argument("--vae_decoder_checkpoint_path", default=None, type=str)
-parser.add_argument("--output_path", required=True, type=str)
-parser.add_argument("--push_to_hub", action="store_true", default=False, help="Whether to push to HF Hub after saving")
-parser.add_argument("--text_encoder_cache_dir", type=str, default=None, help="Path to text encoder cache directory")
-parser.add_argument("--dtype", type=str, default=None)
-
-args = parser.parse_args()
-
-
-# This is specific to `AdaLayerNormContinuous`:
-# Diffusers implementation split the linear projection into the scale, shift while Mochi split it into shift, scale
-def swap_scale_shift(weight, dim):
-    """Swap the two halves of `weight` along `dim`, turning a (shift, scale) layout into (scale, shift).
-
-    Args:
-        weight: tensor whose size along `dim` splits into two chunks.
-        dim: dimension along which the halves are swapped.
-
-    Returns:
-        A new tensor with the second half of `weight` placed before the first half along `dim`.
-    """
-    # honour `dim` instead of hard-coding 0; the callers in this script pass dim=0
-    shift, scale = weight.chunk(2, dim=dim)
-    new_weight = torch.cat([scale, shift], dim=dim)
-    return new_weight
-
-
-def swap_proj_gate(weight):
-    """Return `weight` with its two halves along dim 0 exchanged, i.e. (proj, gate) becomes (gate, proj)."""
-    first_half, second_half = torch.chunk(weight, 2, dim=0)
-    return torch.cat((second_half, first_half), dim=0)
-
-
-def convert_mochi_transformer_checkpoint_to_diffusers(ckpt_path, num_layers=48):
-    """Load an original Mochi transformer checkpoint and rename its tensors to the diffusers layout.
-
-    Fused `qkv` weights are split into separate q/k/v tensors, the MLP `w1` halves are swapped with
-    `swap_proj_gate`, and the final modulation layer is reordered with `swap_scale_shift`. Keys that were
-    not consumed are printed so that unexpected leftovers are visible.
-
-    Args:
-        ckpt_path: path of the safetensors checkpoint, loaded onto the CPU.
-        num_layers: number of transformer blocks in the checkpoint. The last block is treated specially:
-            its context stream has no output projection and no feed-forward weights.
-
-    Returns:
-        A dict mapping diffusers parameter names to tensors.
-
-    Raises:
-        KeyError: if an expected key is missing from the checkpoint.
-    """
-    original_state_dict = load_file(ckpt_path, device="cpu")
-    new_state_dict = {}
-
-    # Convert patch_embed
-    new_state_dict["patch_embed.proj.weight"] = original_state_dict.pop("x_embedder.proj.weight")
-    new_state_dict["patch_embed.proj.bias"] = original_state_dict.pop("x_embedder.proj.bias")
-
-    # Convert time_embed
-    new_state_dict["time_embed.timestep_embedder.linear_1.weight"] = original_state_dict.pop("t_embedder.mlp.0.weight")
-    new_state_dict["time_embed.timestep_embedder.linear_1.bias"] = original_state_dict.pop("t_embedder.mlp.0.bias")
-    new_state_dict["time_embed.timestep_embedder.linear_2.weight"] = original_state_dict.pop("t_embedder.mlp.2.weight")
-    new_state_dict["time_embed.timestep_embedder.linear_2.bias"] = original_state_dict.pop("t_embedder.mlp.2.bias")
-    new_state_dict["time_embed.pooler.to_kv.weight"] = original_state_dict.pop("t5_y_embedder.to_kv.weight")
-    new_state_dict["time_embed.pooler.to_kv.bias"] = original_state_dict.pop("t5_y_embedder.to_kv.bias")
-    new_state_dict["time_embed.pooler.to_q.weight"] = original_state_dict.pop("t5_y_embedder.to_q.weight")
-    new_state_dict["time_embed.pooler.to_q.bias"] = original_state_dict.pop("t5_y_embedder.to_q.bias")
-    new_state_dict["time_embed.pooler.to_out.weight"] = original_state_dict.pop("t5_y_embedder.to_out.weight")
-    new_state_dict["time_embed.pooler.to_out.bias"] = original_state_dict.pop("t5_y_embedder.to_out.bias")
-    new_state_dict["time_embed.caption_proj.weight"] = original_state_dict.pop("t5_yproj.weight")
-    new_state_dict["time_embed.caption_proj.bias"] = original_state_dict.pop("t5_yproj.bias")
-
-    # Convert transformer blocks
-    for i in range(num_layers):
-        block_prefix = f"transformer_blocks.{i}."
-        old_prefix = f"blocks.{i}."
-
-        # norm1
-        new_state_dict[block_prefix + "norm1.linear.weight"] = original_state_dict.pop(old_prefix + "mod_x.weight")
-        new_state_dict[block_prefix + "norm1.linear.bias"] = original_state_dict.pop(old_prefix + "mod_x.bias")
-        if i < num_layers - 1:
-            new_state_dict[block_prefix + "norm1_context.linear.weight"] = original_state_dict.pop(
-                old_prefix + "mod_y.weight"
-            )
-            new_state_dict[block_prefix + "norm1_context.linear.bias"] = original_state_dict.pop(
-                old_prefix + "mod_y.bias"
-            )
-        else:
-            # the last block stores its context modulation under `linear_1` instead of `linear`
-            new_state_dict[block_prefix + "norm1_context.linear_1.weight"] = original_state_dict.pop(
-                old_prefix + "mod_y.weight"
-            )
-            new_state_dict[block_prefix + "norm1_context.linear_1.bias"] = original_state_dict.pop(
-                old_prefix + "mod_y.bias"
-            )
-
-        # Visual attention
-        qkv_weight = original_state_dict.pop(old_prefix + "attn.qkv_x.weight")
-        q, k, v = qkv_weight.chunk(3, dim=0)
-
-        new_state_dict[block_prefix + "attn1.to_q.weight"] = q
-        new_state_dict[block_prefix + "attn1.to_k.weight"] = k
-        new_state_dict[block_prefix + "attn1.to_v.weight"] = v
-        new_state_dict[block_prefix + "attn1.norm_q.weight"] = original_state_dict.pop(
-            old_prefix + "attn.q_norm_x.weight"
-        )
-        new_state_dict[block_prefix + "attn1.norm_k.weight"] = original_state_dict.pop(
-            old_prefix + "attn.k_norm_x.weight"
-        )
-        new_state_dict[block_prefix + "attn1.to_out.0.weight"] = original_state_dict.pop(
-            old_prefix + "attn.proj_x.weight"
-        )
-        new_state_dict[block_prefix + "attn1.to_out.0.bias"] = original_state_dict.pop(old_prefix + "attn.proj_x.bias")
-
-        # Context attention
-        qkv_weight = original_state_dict.pop(old_prefix + "attn.qkv_y.weight")
-        q, k, v = qkv_weight.chunk(3, dim=0)
-
-        new_state_dict[block_prefix + "attn1.add_q_proj.weight"] = q
-        new_state_dict[block_prefix + "attn1.add_k_proj.weight"] = k
-        new_state_dict[block_prefix + "attn1.add_v_proj.weight"] = v
-        new_state_dict[block_prefix + "attn1.norm_added_q.weight"] = original_state_dict.pop(
-            old_prefix + "attn.q_norm_y.weight"
-        )
-        new_state_dict[block_prefix + "attn1.norm_added_k.weight"] = original_state_dict.pop(
-            old_prefix + "attn.k_norm_y.weight"
-        )
-        if i < num_layers - 1:
-            new_state_dict[block_prefix + "attn1.to_add_out.weight"] = original_state_dict.pop(
-                old_prefix + "attn.proj_y.weight"
-            )
-            new_state_dict[block_prefix + "attn1.to_add_out.bias"] = original_state_dict.pop(
-                old_prefix + "attn.proj_y.bias"
-            )
-
-        # MLP
-        new_state_dict[block_prefix + "ff.net.0.proj.weight"] = swap_proj_gate(
-            original_state_dict.pop(old_prefix + "mlp_x.w1.weight")
-        )
-        new_state_dict[block_prefix + "ff.net.2.weight"] = original_state_dict.pop(old_prefix + "mlp_x.w2.weight")
-        if i < num_layers - 1:
-            new_state_dict[block_prefix + "ff_context.net.0.proj.weight"] = swap_proj_gate(
-                original_state_dict.pop(old_prefix + "mlp_y.w1.weight")
-            )
-            new_state_dict[block_prefix + "ff_context.net.2.weight"] = original_state_dict.pop(
-                old_prefix + "mlp_y.w2.weight"
-            )
-
-    # Output layers
-    new_state_dict["norm_out.linear.weight"] = swap_scale_shift(
-        original_state_dict.pop("final_layer.mod.weight"), dim=0
-    )
-    new_state_dict["norm_out.linear.bias"] = swap_scale_shift(original_state_dict.pop("final_layer.mod.bias"), dim=0)
-    new_state_dict["proj_out.weight"] = original_state_dict.pop("final_layer.linear.weight")
-    new_state_dict["proj_out.bias"] = original_state_dict.pop("final_layer.linear.bias")
-
-    new_state_dict["pos_frequencies"] = original_state_dict.pop("pos_frequencies")
-
-    # anything still left in the original dict was not converted
-    print("Remaining Keys:", original_state_dict.keys())
-
-    return new_state_dict
-
-
-def convert_mochi_vae_state_dict_to_diffusers(encoder_ckpt_path, decoder_ckpt_path):
-    encoder_state_dict = load_file(encoder_ckpt_path, device="cpu")
-    decoder_state_dict = load_file(decoder_ckpt_path, device="cpu")
-    new_state_dict = {}
-
-    # ==== Decoder =====
-    prefix = "decoder."
-
-    # Convert conv_in
-    new_state_dict[f"{prefix}conv_in.weight"] = decoder_state_dict.pop("blocks.0.0.weight")
-    new_state_dict[f"{prefix}conv_in.bias"] = decoder_state_dict.pop("blocks.0.0.bias")
-
-    # Convert block_in (MochiMidBlock3D)
-    for i in range(3):  # layers_per_block[-1] = 3
-        new_state_dict[f"{prefix}block_in.resnets.{i}.norm1.norm_layer.weight"] = decoder_state_dict.pop(
-            f"blocks.0.{i+1}.stack.0.weight"
-        )
-        new_state_dict[f"{prefix}block_in.resnets.{i}.norm1.norm_layer.bias"] = decoder_state_dict.pop(
-            f"blocks.0.{i+1}.stack.0.bias"
-        )
-        new_state_dict[f"{prefix}block_in.resnets.{i}.conv1.conv.weight"] = decoder_state_dict.pop(
-            f"blocks.0.{i+1}.stack.2.weight"
-        )
-        new_state_dict[f"{prefix}block_in.resnets.{i}.conv1.conv.bias"] = decoder_state_dict.pop(
-            f"blocks.0.{i+1}.stack.2.bias"
-        )
-        new_state_dict[f"{prefix}block_in.resnets.{i}.norm2.norm_layer.weight"] = decoder_state_dict.pop(
-            f"blocks.0.{i+1}.stack.3.weight"
-        )
-        new_state_dict[f"{prefix}block_in.resnets.{i}.norm2.norm_layer.bias"] = decoder_state_dict.pop(
-            f"blocks.0.{i+1}.stack.3.bias"
-        )
-        new_state_dict[f"{prefix}block_in.resnets.{i}.conv2.conv.weight"] = decoder_state_dict.pop(
-            f"blocks.0.{i+1}.stack.5.weight"
-        )
-        new_state_dict[f"{prefix}block_in.resnets.{i}.conv2.conv.bias"] = decoder_state_dict.pop(
-            f"blocks.0.{i+1}.stack.5.bias"
-        )
-
-    # Convert up_blocks (MochiUpBlock3D)
-    down_block_layers = [6, 4, 3]  # layers_per_block[-2], layers_per_block[-3], layers_per_block[-4]
-    for block in range(3):
-        for i in range(down_block_layers[block]):
-            new_state_dict[f"{prefix}up_blocks.{block}.resnets.{i}.norm1.norm_layer.weight"] = decoder_state_dict.pop(
-                f"blocks.{block+1}.blocks.{i}.stack.0.weight"
-            )
-            new_state_dict[f"{prefix}up_blocks.{block}.resnets.{i}.norm1.norm_layer.bias"] = decoder_state_dict.pop(
-                f"blocks.{block+1}.blocks.{i}.stack.0.bias"
-            )
-            new_state_dict[f"{prefix}up_blocks.{block}.resnets.{i}.conv1.conv.weight"] = decoder_state_dict.pop(
-                f"blocks.{block+1}.blocks.{i}.stack.2.weight"
-            )
-            new_state_dict[f"{prefix}up_blocks.{block}.resnets.{i}.conv1.conv.bias"] = decoder_state_dict.pop(
-                f"blocks.{block+1}.blocks.{i}.stack.2.bias"
-            )
-            new_state_dict[f"{prefix}up_blocks.{block}.resnets.{i}.norm2.norm_layer.weight"] = decoder_state_dict.pop(
-                f"blocks.{block+1}.blocks.{i}.stack.3.weight"
-            )
-            new_state_dict[f"{prefix}up_blocks.{block}.resnets.{i}.norm2.norm_layer.bias"] = decoder_state_dict.pop(
-                f"blocks.{block+1}.blocks.{i}.stack.3.bias"
-            )
-            new_state_dict[f"{prefix}up_blocks.{block}.resnets.{i}.conv2.conv.weight"] = decoder_state_dict.pop(
-                f"blocks.{block+1}.blocks.{i}.stack.5.weight"
-            )
-            new_state_dict[f"{prefix}up_blocks.{block}.resnets.{i}.conv2.conv.bias"] = decoder_state_dict.pop(
-                f"blocks.{block+1}.blocks.{i}.stack.5.bias"
-            )
-        new_state_dict[f"{prefix}up_blocks.{block}.proj.weight"] = decoder_state_dict.pop(
-            f"blocks.{block+1}.proj.weight"
-        )
-        new_state_dict[f"{prefix}up_blocks.{block}.proj.bias"] = decoder_state_dict.pop(f"blocks.{block+1}.proj.bias")
-
-    # Convert block_out (MochiMidBlock3D)
-    for i in range(3):  # layers_per_block[0] = 3
-        new_state_dict[f"{prefix}block_out.resnets.{i}.norm1.norm_layer.weight"] = decoder_state_dict.pop(
-            f"blocks.4.{i}.stack.0.weight"
-        )
-        new_state_dict[f"{prefix}block_out.resnets.{i}.norm1.norm_layer.bias"] = decoder_state_dict.pop(
-            f"blocks.4.{i}.stack.0.bias"
-        )
-        new_state_dict[f"{prefix}block_out.resnets.{i}.conv1.conv.weight"] = decoder_state_dict.pop(
-            f"blocks.4.{i}.stack.2.weight"
-        )
-        new_state_dict[f"{prefix}block_out.resnets.{i}.conv1.conv.bias"] = decoder_state_dict.pop(
-            f"blocks.4.{i}.stack.2.bias"
-        )
-        new_state_dict[f"{prefix}block_out.resnets.{i}.norm2.norm_layer.weight"] = decoder_state_dict.pop(
-            f"blocks.4.{i}.stack.3.weight"
-        )
-        new_state_dict[f"{prefix}block_out.resnets.{i}.norm2.norm_layer.bias"] = decoder_state_dict.pop(
-            f"blocks.4.{i}.stack.3.bias"
-        )
-        new_state_dict[f"{prefix}block_out.resnets.{i}.conv2.conv.weight"] = decoder_state_dict.pop(
-            f"blocks.4.{i}.stack.5.weight"
-        )
-        new_state_dict[f"{prefix}block_out.resnets.{i}.conv2.conv.bias"] = decoder_state_dict.pop(
-            f"blocks.4.{i}.stack.5.bias"
-        )
-
-    # Convert proj_out (Conv1x1 ~= nn.Linear)
-    new_state_dict[f"{prefix}proj_out.weight"] = decoder_state_dict.pop("output_proj.weight")
-    new_state_dict[f"{prefix}proj_out.bias"] = decoder_state_dict.pop("output_proj.bias")
-
-    print("Remaining Decoder Keys:", decoder_state_dict.keys())
-
-    # ==== Encoder =====
-    prefix = "encoder."
-
-    new_state_dict[f"{prefix}proj_in.weight"] = encoder_state_dict.pop("layers.0.weight")
-    new_state_dict[f"{prefix}proj_in.bias"] = encoder_state_dict.pop("layers.0.bias")
-
-    # Convert block_in (MochiMidBlock3D)
-    for i in range(3):  # layers_per_block[0] = 3
-        new_state_dict[f"{prefix}block_in.resnets.{i}.norm1.norm_layer.weight"] = encoder_state_dict.pop(
-            f"layers.{i+1}.stack.0.weight"
-        )
-        new_state_dict[f"{prefix}block_in.resnets.{i}.norm1.norm_layer.bias"] = encoder_state_dict.pop(
-            f"layers.{i+1}.stack.0.bias"
-        )
-        new_state_dict[f"{prefix}block_in.resnets.{i}.conv1.conv.weight"] = encoder_state_dict.pop(
-            f"layers.{i+1}.stack.2.weight"
-        )
-        new_state_dict[f"{prefix}block_in.resnets.{i}.conv1.conv.bias"] = encoder_state_dict.pop(
-            f"layers.{i+1}.stack.2.bias"
-        )
-        new_state_dict[f"{prefix}block_in.resnets.{i}.norm2.norm_layer.weight"] = encoder_state_dict.pop(
-            f"layers.{i+1}.stack.3.weight"
-        )
-        new_state_dict[f"{prefix}block_in.resnets.{i}.norm2.norm_layer.bias"] = encoder_state_dict.pop(
-            f"layers.{i+1}.stack.3.bias"
-        )
-        new_state_dict[f"{prefix}block_in.resnets.{i}.conv2.conv.weight"] = encoder_state_dict.pop(
-            f"layers.{i+1}.stack.5.weight"
-        )
-        new_state_dict[f"{prefix}block_in.resnets.{i}.conv2.conv.bias"] = encoder_state_dict.pop(
-            f"layers.{i+1}.stack.5.bias"
-        )
-
-    # Convert down_blocks (MochiDownBlock3D)
-    down_block_layers = [3, 4, 6]  # layers_per_block[1], layers_per_block[2], layers_per_block[3]
-    for block in range(3):
-        new_state_dict[f"{prefix}down_blocks.{block}.conv_in.conv.weight"] = encoder_state_dict.pop(
-            f"layers.{block+4}.layers.0.weight"
-        )
-        new_state_dict[f"{prefix}down_blocks.{block}.conv_in.conv.bias"] = encoder_state_dict.pop(
-            f"layers.{block+4}.layers.0.bias"
-        )
-
-        for i in range(down_block_layers[block]):
-            # Convert resnets
-            new_state_dict[
-                f"{prefix}down_blocks.{block}.resnets.{i}.norm1.norm_layer.weight"
-            ] = encoder_state_dict.pop(f"layers.{block+4}.layers.{i+1}.stack.0.weight")
-            new_state_dict[f"{prefix}down_blocks.{block}.resnets.{i}.norm1.norm_layer.bias"] = encoder_state_dict.pop(
-                f"layers.{block+4}.layers.{i+1}.stack.0.bias"
-            )
-            new_state_dict[f"{prefix}down_blocks.{block}.resnets.{i}.conv1.conv.weight"] = encoder_state_dict.pop(
-                f"layers.{block+4}.layers.{i+1}.stack.2.weight"
-            )
-            new_state_dict[f"{prefix}down_blocks.{block}.resnets.{i}.conv1.conv.bias"] = encoder_state_dict.pop(
-                f"layers.{block+4}.layers.{i+1}.stack.2.bias"
-            )
-            new_state_dict[
-                f"{prefix}down_blocks.{block}.resnets.{i}.norm2.norm_layer.weight"
-            ] = encoder_state_dict.pop(f"layers.{block+4}.layers.{i+1}.stack.3.weight")
-            new_state_dict[f"{prefix}down_blocks.{block}.resnets.{i}.norm2.norm_layer.bias"] = encoder_state_dict.pop(
-                f"layers.{block+4}.layers.{i+1}.stack.3.bias"
-            )
-            new_state_dict[f"{prefix}down_blocks.{block}.resnets.{i}.conv2.conv.weight"] = encoder_state_dict.pop(
-                f"layers.{block+4}.layers.{i+1}.stack.5.weight"
-            )
-            new_state_dict[f"{prefix}down_blocks.{block}.resnets.{i}.conv2.conv.bias"] = encoder_state_dict.pop(
-                f"layers.{block+4}.layers.{i+1}.stack.5.bias"
-            )
-
-            # Convert attentions
-            qkv_weight = encoder_state_dict.pop(f"layers.{block+4}.layers.{i+1}.attn_block.attn.qkv.weight")
-            q, k, v = qkv_weight.chunk(3, dim=0)
-
-            new_state_dict[f"{prefix}down_blocks.{block}.attentions.{i}.to_q.weight"] = q
-            new_state_dict[f"{prefix}down_blocks.{block}.attentions.{i}.to_k.weight"] = k
-            new_state_dict[f"{prefix}down_blocks.{block}.attentions.{i}.to_v.weight"] = v
-            new_state_dict[f"{prefix}down_blocks.{block}.attentions.{i}.to_out.0.weight"] = encoder_state_dict.pop(
-                f"layers.{block+4}.layers.{i+1}.attn_block.attn.out.weight"
-            )
-            new_state_dict[f"{prefix}down_blocks.{block}.attentions.{i}.to_out.0.bias"] = encoder_state_dict.pop(
-                f"layers.{block+4}.layers.{i+1}.attn_block.attn.out.bias"
-            )
-            new_state_dict[f"{prefix}down_blocks.{block}.norms.{i}.norm_layer.weight"] = encoder_state_dict.pop(
-                f"layers.{block+4}.layers.{i+1}.attn_block.norm.weight"
-            )
-            new_state_dict[f"{prefix}down_blocks.{block}.norms.{i}.norm_layer.bias"] = encoder_state_dict.pop(
-                f"layers.{block+4}.layers.{i+1}.attn_block.norm.bias"
-            )
-
-    # Convert block_out (MochiMidBlock3D)
-    for i in range(3):  # layers_per_block[-1] = 3
-        # Convert resnets
-        new_state_dict[f"{prefix}block_out.resnets.{i}.norm1.norm_layer.weight"] = encoder_state_dict.pop(
-            f"layers.{i+7}.stack.0.weight"
-        )
-        new_state_dict[f"{prefix}block_out.resnets.{i}.norm1.norm_layer.bias"] = encoder_state_dict.pop(
-            f"layers.{i+7}.stack.0.bias"
-        )
-        new_state_dict[f"{prefix}block_out.resnets.{i}.conv1.conv.weight"] = encoder_state_dict.pop(
-            f"layers.{i+7}.stack.2.weight"
-        )
-        new_state_dict[f"{prefix}block_out.resnets.{i}.conv1.conv.bias"] = encoder_state_dict.pop(
-            f"layers.{i+7}.stack.2.bias"
-        )
-        new_state_dict[f"{prefix}block_out.resnets.{i}.norm2.norm_layer.weight"] = encoder_state_dict.pop(
-            f"layers.{i+7}.stack.3.weight"
-        )
-        new_state_dict[f"{prefix}block_out.resnets.{i}.norm2.norm_layer.bias"] = encoder_state_dict.pop(
-            f"layers.{i+7}.stack.3.bias"
-        )
-        new_state_dict[f"{prefix}block_out.resnets.{i}.conv2.conv.weight"] = encoder_state_dict.pop(
-            f"layers.{i+7}.stack.5.weight"
-        )
-        new_state_dict[f"{prefix}block_out.resnets.{i}.conv2.conv.bias"] = encoder_state_dict.pop(
-            f"layers.{i+7}.stack.5.bias"
-        )
-
-        # Convert attentions
-        qkv_weight = encoder_state_dict.pop(f"layers.{i+7}.attn_block.attn.qkv.weight")
-        q, k, v = qkv_weight.chunk(3, dim=0)
-
-        new_state_dict[f"{prefix}block_out.attentions.{i}.to_q.weight"] = q
-        new_state_dict[f"{prefix}block_out.attentions.{i}.to_k.weight"] = k
-        new_state_dict[f"{prefix}block_out.attentions.{i}.to_v.weight"] = v
-        new_state_dict[f"{prefix}block_out.attentions.{i}.to_out.0.weight"] = encoder_state_dict.pop(
-            f"layers.{i+7}.attn_block.attn.out.weight"
-        )
-        new_state_dict[f"{prefix}block_out.attentions.{i}.to_out.0.bias"] = encoder_state_dict.pop(
-            f"layers.{i+7}.attn_block.attn.out.bias"
-        )
-        new_state_dict[f"{prefix}block_out.norms.{i}.norm_layer.weight"] = encoder_state_dict.pop(
-            f"layers.{i+7}.attn_block.norm.weight"
-        )
-        new_state_dict[f"{prefix}block_out.norms.{i}.norm_layer.bias"] = encoder_state_dict.pop(
-            f"layers.{i+7}.attn_block.norm.bias"
-        )
-
-    # Convert output layers
-    new_state_dict[f"{prefix}norm_out.norm_layer.weight"] = encoder_state_dict.pop("output_norm.weight")
-    new_state_dict[f"{prefix}norm_out.norm_layer.bias"] = encoder_state_dict.pop("output_norm.bias")
-    new_state_dict[f"{prefix}proj_out.weight"] = encoder_state_dict.pop("output_proj.weight")
-
-    print("Remaining Encoder Keys:", encoder_state_dict.keys())
-
-    return new_state_dict
-
-
-def main(args):
-    if args.dtype is None:
-        dtype = None
-    if args.dtype == "fp16":
-        dtype = torch.float16
-    elif args.dtype == "bf16":
-        dtype = torch.bfloat16
-    elif args.dtype == "fp32":
-        dtype = torch.float32
-    else:
-        raise ValueError(f"Unsupported dtype: {args.dtype}")
-
-    transformer = None
-    vae = None
-
-    if args.transformer_checkpoint_path is not None:
-        converted_transformer_state_dict = convert_mochi_transformer_checkpoint_to_diffusers(
-            args.transformer_checkpoint_path
-        )
-        transformer = MochiTransformer3DModel()
-        transformer.load_state_dict(converted_transformer_state_dict, strict=True)
-        if dtype is not None:
-            transformer = transformer.to(dtype=dtype)
-
-    if args.vae_encoder_checkpoint_path is not None and args.vae_decoder_checkpoint_path is not None:
-        vae = AutoencoderKLMochi(latent_channels=12, out_channels=3)
-        converted_vae_state_dict = convert_mochi_vae_state_dict_to_diffusers(
-            args.vae_encoder_checkpoint_path, args.vae_decoder_checkpoint_path
-        )
-        vae.load_state_dict(converted_vae_state_dict, strict=True)
-        if dtype is not None:
-            vae = vae.to(dtype=dtype)
-
-    text_encoder_id = "google/t5-v1_1-xxl"
-    tokenizer = T5Tokenizer.from_pretrained(text_encoder_id, model_max_length=TOKENIZER_MAX_LENGTH)
-    text_encoder = T5EncoderModel.from_pretrained(text_encoder_id, cache_dir=args.text_encoder_cache_dir)
-
-    # Apparently, the conversion does not work anymore without this :shrug:
-    for param in text_encoder.parameters():
-        param.data = param.data.contiguous()
-
-    pipe = MochiPipeline(
-        scheduler=FlowMatchEulerDiscreteScheduler(invert_sigmas=True),
-        vae=vae,
-        text_encoder=text_encoder,
-        tokenizer=tokenizer,
-        transformer=transformer,
-    )
-    pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB", push_to_hub=args.push_to_hub)
-
-
-if __name__ == "__main__":
-    main(args)
@@ -16,9 +16,10 @@ CTX = init_empty_weights if is_accelerate_available else nullcontext
 parser = argparse.ArgumentParser()
 parser.add_argument("--checkpoint_path", type=str)
 parser.add_argument("--output_path", type=str)
-parser.add_argument("--dtype", type=str)
+parser.add_argument("--dtype", type=str, default="fp16")

 args = parser.parse_args()
+dtype = torch.float16 if args.dtype == "fp16" else torch.float32


 def load_original_checkpoint(ckpt_path):
@@ -39,9 +40,7 @@ def swap_scale_shift(weight, dim):
    return new_weight


-def convert_sd3_transformer_checkpoint_to_diffusers(
-    original_state_dict, num_layers, caption_projection_dim, dual_attention_layers, has_qk_norm
-):
+def convert_sd3_transformer_checkpoint_to_diffusers(original_state_dict, num_layers, caption_projection_dim):
    converted_state_dict = {}

    # Positional and patch embeddings.
@@ -111,21 +110,6 @@ def convert_sd3_transformer_checkpoint_to_diffusers(
        converted_state_dict[f"transformer_blocks.{i}.attn.add_v_proj.weight"] = torch.cat([context_v])
        converted_state_dict[f"transformer_blocks.{i}.attn.add_v_proj.bias"] = torch.cat([context_v_bias])

-        # qk norm
-        if has_qk_norm:
-            converted_state_dict[f"transformer_blocks.{i}.attn.norm_q.weight"] = original_state_dict.pop(
-                f"joint_blocks.{i}.x_block.attn.ln_q.weight"
-            )
-            converted_state_dict[f"transformer_blocks.{i}.attn.norm_k.weight"] = original_state_dict.pop(
-                f"joint_blocks.{i}.x_block.attn.ln_k.weight"
-            )
-            converted_state_dict[f"transformer_blocks.{i}.attn.norm_added_q.weight"] = original_state_dict.pop(
-                f"joint_blocks.{i}.context_block.attn.ln_q.weight"
-            )
-            converted_state_dict[f"transformer_blocks.{i}.attn.norm_added_k.weight"] = original_state_dict.pop(
-                f"joint_blocks.{i}.context_block.attn.ln_k.weight"
-            )
-
        # output projections.
        converted_state_dict[f"transformer_blocks.{i}.attn.to_out.0.weight"] = original_state_dict.pop(
            f"joint_blocks.{i}.x_block.attn.proj.weight"
@@ -141,39 +125,6 @@ def convert_sd3_transformer_checkpoint_to_diffusers(
                f"joint_blocks.{i}.context_block.attn.proj.bias"
            )

-        # attn2
-        if i in dual_attention_layers:
-            # Q, K, V
-            sample_q2, sample_k2, sample_v2 = torch.chunk(
-                original_state_dict.pop(f"joint_blocks.{i}.x_block.attn2.qkv.weight"), 3, dim=0
-            )
-            sample_q2_bias, sample_k2_bias, sample_v2_bias = torch.chunk(
-                original_state_dict.pop(f"joint_blocks.{i}.x_block.attn2.qkv.bias"), 3, dim=0
-            )
-            converted_state_dict[f"transformer_blocks.{i}.attn2.to_q.weight"] = torch.cat([sample_q2])
-            converted_state_dict[f"transformer_blocks.{i}.attn2.to_q.bias"] = torch.cat([sample_q2_bias])
-            converted_state_dict[f"transformer_blocks.{i}.attn2.to_k.weight"] = torch.cat([sample_k2])
-            converted_state_dict[f"transformer_blocks.{i}.attn2.to_k.bias"] = torch.cat([sample_k2_bias])
-            converted_state_dict[f"transformer_blocks.{i}.attn2.to_v.weight"] = torch.cat([sample_v2])
-            converted_state_dict[f"transformer_blocks.{i}.attn2.to_v.bias"] = torch.cat([sample_v2_bias])
-
-            # qk norm
-            if has_qk_norm:
-                converted_state_dict[f"transformer_blocks.{i}.attn2.norm_q.weight"] = original_state_dict.pop(
-                    f"joint_blocks.{i}.x_block.attn2.ln_q.weight"
-                )
-                converted_state_dict[f"transformer_blocks.{i}.attn2.norm_k.weight"] = original_state_dict.pop(
-                    f"joint_blocks.{i}.x_block.attn2.ln_k.weight"
-                )
-
-            # output projections.
-            converted_state_dict[f"transformer_blocks.{i}.attn2.to_out.0.weight"] = original_state_dict.pop(
-                f"joint_blocks.{i}.x_block.attn2.proj.weight"
-            )
-            converted_state_dict[f"transformer_blocks.{i}.attn2.to_out.0.bias"] = original_state_dict.pop(
-                f"joint_blocks.{i}.x_block.attn2.proj.bias"
-            )
-
        # norms.
        converted_state_dict[f"transformer_blocks.{i}.norm1.linear.weight"] = original_state_dict.pop(
            f"joint_blocks.{i}.x_block.adaLN_modulation.1.weight"
@@ -244,79 +195,25 @@ def is_vae_in_checkpoint(original_state_dict):
    )


-def get_attn2_layers(state_dict):
-    attn2_layers = []
-    for key in state_dict.keys():
-        if "attn2." in key:
-            # Extract the layer number from the key
-            layer_num = int(key.split(".")[1])
-            attn2_layers.append(layer_num)
-    return tuple(sorted(set(attn2_layers)))
-
-
-def get_pos_embed_max_size(state_dict):
-    num_patches = state_dict["pos_embed"].shape[1]
-    pos_embed_max_size = int(num_patches**0.5)
-    return pos_embed_max_size
-
-
-def get_caption_projection_dim(state_dict):
-    caption_projection_dim = state_dict["context_embedder.weight"].shape[0]
-    return caption_projection_dim
-
-
 def main(args):
    original_ckpt = load_original_checkpoint(args.checkpoint_path)
-    original_dtype = next(iter(original_ckpt.values())).dtype
-
-    # Initialize dtype with a default value
-    dtype = None
-
-    if args.dtype is None:
-        dtype = original_dtype
-    elif args.dtype == "fp16":
-        dtype = torch.float16
-    elif args.dtype == "bf16":
-        dtype = torch.bfloat16
-    elif args.dtype == "fp32":
-        dtype = torch.float32
-    else:
-        raise ValueError(f"Unsupported dtype: {args.dtype}")
-
-    if dtype != original_dtype:
-        print(
-            f"Checkpoint dtype {original_dtype} does not match requested dtype {dtype}. This can lead to unexpected results, proceed with caution."
-        )
-
    num_layers = list(set(int(k.split(".", 2)[1]) for k in original_ckpt if "joint_blocks" in k))[-1] + 1  # noqa: C401
-
-    caption_projection_dim = get_caption_projection_dim(original_ckpt)
-
-    # () for sd3.0; (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12) for sd3.5
-    attn2_layers = get_attn2_layers(original_ckpt)
-
-    # sd3.5 use qk norm("rms_norm")
-    has_qk_norm = any("ln_q" in key for key in original_ckpt.keys())
-
-    # sd3.5 2b use pox_embed_max_size=384 and sd3.0 and sd3.5 8b use 192
-    pos_embed_max_size = get_pos_embed_max_size(original_ckpt)
+    caption_projection_dim = 1536

    converted_transformer_state_dict = convert_sd3_transformer_checkpoint_to_diffusers(
-        original_ckpt, num_layers, caption_projection_dim, attn2_layers, has_qk_norm
+        original_ckpt, num_layers, caption_projection_dim
    )

    with CTX():
        transformer = SD3Transformer2DModel(
-            sample_size=128,
+            sample_size=64,
            patch_size=2,
            in_channels=16,
            joint_attention_dim=4096,
            num_layers=num_layers,
            caption_projection_dim=caption_projection_dim,
-            num_attention_heads=num_layers,
-            pos_embed_max_size=pos_embed_max_size,
-            qk_norm="rms_norm" if has_qk_norm else None,
-            dual_attention_layers=attn2_layers,
+            num_attention_heads=24,
+            pos_embed_max_size=192,
        )
    if is_accelerate_available():
        load_model_dict_into_meta(transformer, converted_transformer_state_dict)
@@ -130,7 +130,7 @@ _deps = [
    "regex!=2019.12.17",
    "requests",
    "tensorboard",
-    "torch>=1.4,<2.5.0",
+    "torch>=1.4",
    "torchvision",
    "transformers>=4.41.2",
    "urllib3<=2.0.0",
@@ -254,7 +254,7 @@ version_range_max = max(sys.version_info[1], 10) + 1

 setup(
    name="diffusers",
-    version="0.32.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
+    version="0.31.0.dev0",  # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
    description="State-of-the-art diffusion in PyTorch and JAX.",
    long_description=open("README.md", "r", encoding="utf-8").read(),
    long_description_content_type="text/markdown",
@@ -1,4 +1,4 @@
-__version__ = "0.32.0.dev0"
+__version__ = "0.31.0.dev0"

 from typing import TYPE_CHECKING

@@ -31,7 +31,6 @@ _import_structure = {
    "loaders": ["FromOriginalModelMixin"],
    "models": [],
    "pipelines": [],
-    "quantizers.quantization_config": ["BitsAndBytesConfig"],
    "schedulers": [],
    "utils": [
        "OptionalDependencyNotAvailable",
@@ -77,13 +76,10 @@ except OptionalDependencyNotAvailable:
 else:
    _import_structure["models"].extend(
        [
-            "AllegroTransformer3DModel",
            "AsymmetricAutoencoderKL",
            "AuraFlowTransformer2DModel",
            "AutoencoderKL",
-            "AutoencoderKLAllegro",
            "AutoencoderKLCogVideoX",
-            "AutoencoderKLMochi",
            "AutoencoderKLTemporalDecoder",
            "AutoencoderOobleck",
            "AutoencoderTiny",
@@ -103,7 +99,6 @@ else:
            "Kandinsky3UNet",
            "LatteTransformer3DModel",
            "LuminaNextDiT2DModel",
-            "MochiTransformer3DModel",
            "ModelMixin",
            "MotionAdapter",
            "MultiAdapter",
@@ -129,6 +124,7 @@ else:
            "VQModel",
        ]
    )
+
    _import_structure["optimization"] = [
        "get_constant_schedule",
        "get_constant_schedule_with_warmup",
@@ -160,7 +156,6 @@ else:
            "StableDiffusionMixin",
        ]
    )
-    _import_structure["quantizers"] = ["DiffusersQuantizer"]
    _import_structure["schedulers"].extend(
        [
            "AmusedScheduler",
@@ -241,7 +236,6 @@ except OptionalDependencyNotAvailable:
 else:
    _import_structure["pipelines"].extend(
        [
-            "AllegroPipeline",
            "AltDiffusionImg2ImgPipeline",
            "AltDiffusionPipeline",
            "AmusedImg2ImgPipeline",
@@ -313,7 +307,6 @@ else:
            "LuminaText2ImgPipeline",
            "MarigoldDepthPipeline",
            "MarigoldNormalsPipeline",
-            "MochiPipeline",
            "MusicLDMPipeline",
            "PaintByExamplePipeline",
            "PIAPipeline",
@@ -487,7 +480,7 @@ except OptionalDependencyNotAvailable:


 else:
-    _import_structure["models.controlnets.controlnet_flax"] = ["FlaxControlNetModel"]
+    _import_structure["models.controlnet_flax"] = ["FlaxControlNetModel"]
    _import_structure["models.modeling_flax_utils"] = ["FlaxModelMixin"]
    _import_structure["models.unets.unet_2d_condition_flax"] = ["FlaxUNet2DConditionModel"]
    _import_structure["models.vae_flax"] = ["FlaxAutoencoderKL"]
@@ -545,7 +538,6 @@ else:

 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    from .configuration_utils import ConfigMixin
-    from .quantizers.quantization_config import BitsAndBytesConfig

    try:
        if not is_onnx_available():
@@ -562,13 +554,10 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .utils.dummy_pt_objects import *  # noqa F403
    else:
        from .models import (
-            AllegroTransformer3DModel,
            AsymmetricAutoencoderKL,
            AuraFlowTransformer2DModel,
            AutoencoderKL,
-            AutoencoderKLAllegro,
            AutoencoderKLCogVideoX,
-            AutoencoderKLMochi,
            AutoencoderKLTemporalDecoder,
            AutoencoderOobleck,
            AutoencoderTiny,
@@ -588,7 +577,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            Kandinsky3UNet,
            LatteTransformer3DModel,
            LuminaNextDiT2DModel,
-            MochiTransformer3DModel,
            ModelMixin,
            MotionAdapter,
            MultiAdapter,
@@ -644,7 +632,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            ScoreSdeVePipeline,
            StableDiffusionMixin,
        )
-        from .quantizers import DiffusersQuantizer
        from .schedulers import (
            AmusedScheduler,
            CMStochasticIterativeScheduler,
@@ -707,7 +694,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .utils.dummy_torch_and_transformers_objects import *  # noqa F403
    else:
        from .pipelines import (
-            AllegroPipeline,
            AltDiffusionImg2ImgPipeline,
            AltDiffusionPipeline,
            AmusedImg2ImgPipeline,
@@ -777,7 +763,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            LuminaText2ImgPipeline,
            MarigoldDepthPipeline,
            MarigoldNormalsPipeline,
-            MochiPipeline,
            MusicLDMPipeline,
            PaintByExamplePipeline,
            PIAPipeline,
@@ -914,7 +899,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    except OptionalDependencyNotAvailable:
        from .utils.dummy_flax_objects import *  # noqa F403
    else:
-        from .models.controlnets.controlnet_flax import FlaxControlNetModel
+        from .models.controlnet_flax import FlaxControlNetModel
        from .models.modeling_flax_utils import FlaxModelMixin
        from .models.unets.unet_2d_condition_flax import FlaxUNet2DConditionModel
        from .models.vae_flax import FlaxAutoencoderKL
@@ -97,17 +97,13 @@ class SDCFGCutoffCallback(PipelineCallback):

 class SDXLCFGCutoffCallback(PipelineCallback):
    """
-    Callback function for the base Stable Diffusion XL Pipelines. After certain number of steps (set by
-    `cutoff_step_ratio` or `cutoff_step_index`), this callback will disable the CFG.
+    Callback function for Stable Diffusion XL Pipelines. After certain number of steps (set by `cutoff_step_ratio` or
+    `cutoff_step_index`), this callback will disable the CFG.

    Note: This callback mutates the pipeline by changing the `_guidance_scale` attribute to 0.0 after the cutoff step.
    """

-    tensor_inputs = [
-        "prompt_embeds",
-        "add_text_embeds",
-        "add_time_ids",
-    ]
+    tensor_inputs = ["prompt_embeds", "add_text_embeds", "add_time_ids"]

    def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
        cutoff_step_ratio = self.config.cutoff_step_ratio
@@ -133,55 +129,6 @@ class SDXLCFGCutoffCallback(PipelineCallback):
            callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
            callback_kwargs[self.tensor_inputs[1]] = add_text_embeds
            callback_kwargs[self.tensor_inputs[2]] = add_time_ids
-
-        return callback_kwargs
-
-
-class SDXLControlnetCFGCutoffCallback(PipelineCallback):
-    """
-    Callback function for the Controlnet Stable Diffusion XL Pipelines. After certain number of steps (set by
-    `cutoff_step_ratio` or `cutoff_step_index`), this callback will disable the CFG.
-
-    Note: This callback mutates the pipeline by changing the `_guidance_scale` attribute to 0.0 after the cutoff step.
-    """
-
-    tensor_inputs = [
-        "prompt_embeds",
-        "add_text_embeds",
-        "add_time_ids",
-        "image",
-    ]
-
-    def callback_fn(self, pipeline, step_index, timestep, callback_kwargs) -> Dict[str, Any]:
-        cutoff_step_ratio = self.config.cutoff_step_ratio
-        cutoff_step_index = self.config.cutoff_step_index
-
-        # Use cutoff_step_index if it's not None, otherwise use cutoff_step_ratio
-        cutoff_step = (
-            cutoff_step_index if cutoff_step_index is not None else int(pipeline.num_timesteps * cutoff_step_ratio)
-        )
-
-        if step_index == cutoff_step:
-            prompt_embeds = callback_kwargs[self.tensor_inputs[0]]
-            prompt_embeds = prompt_embeds[-1:]  # "-1" denotes the embeddings for conditional text tokens.
-
-            add_text_embeds = callback_kwargs[self.tensor_inputs[1]]
-            add_text_embeds = add_text_embeds[-1:]  # "-1" denotes the embeddings for conditional pooled text tokens
-
-            add_time_ids = callback_kwargs[self.tensor_inputs[2]]
-            add_time_ids = add_time_ids[-1:]  # "-1" denotes the embeddings for conditional added time vector
-
-            # For Controlnet
-            image = callback_kwargs[self.tensor_inputs[3]]
-            image = image[-1:]
-
-            pipeline._guidance_scale = 0.0
-
-            callback_kwargs[self.tensor_inputs[0]] = prompt_embeds
-            callback_kwargs[self.tensor_inputs[1]] = add_text_embeds
-            callback_kwargs[self.tensor_inputs[2]] = add_time_ids
-            callback_kwargs[self.tensor_inputs[3]] = image
-
        return callback_kwargs


--- a/Show More
+++ b/Show More