fix

update dep table.
2025-05-06 10:59:19 +05:30 · 2025-05-06 10:57:38 +05:30
76 changed files with 147 additions and 9439 deletions
@@ -142,7 +142,6 @@ jobs:
        HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
        # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
        CUBLAS_WORKSPACE_CONFIG: :16:8
-        RUN_COMPILE: yes
      run: |
        python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
          -s -v -k "not Flax and not Onnx" \
@@ -526,60 +525,6 @@ jobs:
          pip install slack_sdk tabulate
          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY

-  run_nightly_pipeline_level_quantization_tests:
-    name: Torch quantization nightly tests
-    strategy:
-      fail-fast: false
-      max-parallel: 2
-    runs-on:
-      group: aws-g6e-xlarge-plus
-    container:
-      image: diffusers/diffusers-pytorch-cuda
-      options: --shm-size "20gb" --ipc host --gpus 0
-    steps:
-      - name: Checkout diffusers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-      - name: NVIDIA-SMI
-        run: nvidia-smi
-      - name: Install dependencies
-        run: |
-          python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
-          python -m uv pip install -e [quality,test]
-          python -m uv pip install -U bitsandbytes optimum_quanto
-          python -m uv pip install pytest-reportlog
-      - name: Environment
-        run: |
-          python utils/print_env.py
-      - name: Pipeline-level quantization tests on GPU
-        env:
-          HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
-          # https://pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms
-          CUBLAS_WORKSPACE_CONFIG: :16:8
-          BIG_GPU_MEMORY: 40
-        run: |
-          python -m pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-            --make-reports=tests_pipeline_level_quant_torch_cuda \
-            --report-log=tests_pipeline_level_quant_torch_cuda.log \
-            tests/quantization/test_pipeline_level_quantization.py
-      - name: Failure short reports
-        if: ${{ failure() }}
-        run: |
-          cat reports/tests_pipeline_level_quant_torch_cuda_stats.txt
-          cat reports/tests_pipeline_level_quant_torch_cuda_failures_short.txt
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v4
-        with:
-          name: torch_cuda_pipeline_level_quant_reports
-          path: reports
-      - name: Generate Report and Notify Channel
-        if: always()
-        run: |
-          pip install slack_sdk tabulate
-          python utils/log_reports.py >> $GITHUB_STEP_SUMMARY
-  
 # M1 runner currently not well supported
 # TODO: (Dhruv) add these back when we setup better testing for Apple Silicon
 #  run_nightly_tests_apple_m1:
@@ -295,8 +295,6 @@
        title: CogView4Transformer2DModel
      - local: api/models/consisid_transformer3d
        title: ConsisIDTransformer3DModel
-      - local: api/models/cosmos_transformer3d
-        title: CosmosTransformer3DModel
      - local: api/models/dit_transformer2d
        title: DiTTransformer2DModel
      - local: api/models/easyanimate_transformer3d
@@ -365,8 +363,6 @@
        title: AutoencoderKLAllegro
      - local: api/models/autoencoderkl_cogvideox
        title: AutoencoderKLCogVideoX
-      - local: api/models/autoencoderkl_cosmos
-        title: AutoencoderKLCosmos
      - local: api/models/autoencoder_kl_hunyuan_video
        title: AutoencoderKLHunyuanVideo
      - local: api/models/autoencoderkl_ltx_video
@@ -437,8 +433,6 @@
      title: ControlNet-XS with Stable Diffusion XL
    - local: api/pipelines/controlnet_union
      title: ControlNetUnion
-    - local: api/pipelines/cosmos
-      title: Cosmos
    - local: api/pipelines/dance_diffusion
      title: Dance Diffusion
    - local: api/pipelines/ddim
@@ -1,40 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# AutoencoderKLCosmos
-
-[Cosmos Tokenizers](https://github.com/NVIDIA/Cosmos-Tokenizer).
-
-Supported models:
- [nvidia/Cosmos-1.0-Tokenizer-CV8x8x8](https://huggingface.co/nvidia/Cosmos-1.0-Tokenizer-CV8x8x8)
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import AutoencoderKLCosmos
-
-vae = AutoencoderKLCosmos.from_pretrained("nvidia/Cosmos-1.0-Tokenizer-CV8x8x8", subfolder="vae")
-```
-
-## AutoencoderKLCosmos
-
-[[autodoc]] AutoencoderKLCosmos
-    - decode
-    - encode
-    - all
-
-## AutoencoderKLOutput
-
-[[autodoc]] models.autoencoders.autoencoder_kl.AutoencoderKLOutput
-
-## DecoderOutput
-
-[[autodoc]] models.autoencoders.vae.DecoderOutput
@@ -1,30 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License. -->
-
-# CosmosTransformer3DModel
-
-A Diffusion Transformer model for 3D video-like data, introduced in [Cosmos World Foundation Model Platform for Physical AI](https://huggingface.co/papers/2501.03575) by NVIDIA.
-
-The model can be loaded with the following code snippet.
-
-```python
-from diffusers import CosmosTransformer3DModel
-
-transformer = CosmosTransformer3DModel.from_pretrained("nvidia/Cosmos-1.0-Diffusion-7B-Text2World", subfolder="transformer", torch_dtype=torch.bfloat16)
-```
-
-## CosmosTransformer3DModel
-
-[[autodoc]] CosmosTransformer3DModel
-
-## Transformer2DModelOutput
-
-[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
@@ -1,41 +0,0 @@
-<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License. -->
-
-# Cosmos
-
-[Cosmos World Foundation Model Platform for Physical AI](https://huggingface.co/papers/2501.03575) by NVIDIA.
-
-*Physical AI needs to be trained digitally first. It needs a digital twin of itself, the policy model, and a digital twin of the world, the world model. In this paper, we present the Cosmos World Foundation Model Platform to help developers build customized world models for their Physical AI setups. We position a world foundation model as a general-purpose world model that can be fine-tuned into customized world models for downstream applications. Our platform covers a video curation pipeline, pre-trained world foundation models, examples of post-training of pre-trained world foundation models, and video tokenizers. To help Physical AI builders solve the most critical problems of our society, we make our platform open-source and our models open-weight with permissive licenses available via https://github.com/NVIDIA/Cosmos.*
-
-<Tip>
-
-Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers) to learn how to explore the tradeoff between scheduler speed and quality, and see the [reuse components across pipelines](../../using-diffusers/loading#reuse-a-pipeline) section to learn how to efficiently load the same components into multiple pipelines.
-
-</Tip>
-
-## CosmosTextToWorldPipeline
-
-[[autodoc]] CosmosTextToWorldPipeline
-  - all
-  - __call__
-
-## CosmosVideoToWorldPipeline
-
-[[autodoc]] CosmosVideoToWorldPipeline
-  - all
-  - __call__
-
-## CosmosPipelineOutput
-
-[[autodoc]] pipelines.cosmos.pipeline_output.CosmosPipelineOutput
@@ -52,7 +52,6 @@ The following models are available for the image-to-video pipeline:
 | [`Skywork/SkyReels-V1-Hunyuan-I2V`](https://huggingface.co/Skywork/SkyReels-V1-Hunyuan-I2V) | Skywork's custom finetune of HunyuanVideo (de-distilled). Performs best at `97x544x960` resolution, `guidance_scale=1.0`, `true_cfg_scale=6.0` and a negative prompt. |
 | [`hunyuanvideo-community/HunyuanVideo-I2V-33ch`](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-I2V) | Tencent's official HunyuanVideo 33-channel I2V model. Performs best at resolutions of 480, 720, 960, 1280. A higher `shift` value when initializing the scheduler is recommended (good values are between 7 and 20). |
 | [`hunyuanvideo-community/HunyuanVideo-I2V`](https://huggingface.co/hunyuanvideo-community/HunyuanVideo-I2V) | Tencent's official HunyuanVideo 16-channel I2V model. Performs best at resolutions of 480, 720, 960, 1280. A higher `shift` value when initializing the scheduler is recommended (good values are between 7 and 20). |
- [`lllyasviel/FramePackI2V_HY`](https://huggingface.co/lllyasviel/FramePackI2V_HY) | lllyasviel's paper introducing a new technique for long-context video generation called [Framepack](https://arxiv.org/abs/2504.12626). |

 ## Quantization

@@ -13,7 +13,9 @@ specific language governing permissions and limitations under the License.

 # Quantization

-Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference.
+Quantization techniques reduce memory and computational costs by representing weights and activations with lower-precision data types like 8-bit integers (int8). This enables loading larger models you normally wouldn't be able to fit into memory, and speeding up inference. Diffusers supports 8-bit and 4-bit quantization with [bitsandbytes](https://huggingface.co/docs/bitsandbytes/en/index).
+
+Quantization techniques that aren't supported in Transformers can be added with the [`DiffusersQuantizer`] class.

 <Tip>

@@ -21,9 +23,6 @@ Learn how to quantize models in the [Quantization](../quantization/overview) gui

 </Tip>

-## PipelineQuantizationConfig
-
-[[autodoc]] quantizers.PipelineQuantizationConfig

 ## BitsAndBytesConfig

@@ -48,7 +48,7 @@ For Ada and higher-series GPUs. we recommend changing `torch_dtype` to `torch.bf
 ```py
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
-import torch
+
 from diffusers import AutoModel
 from transformers import T5EncoderModel

@@ -88,8 +88,6 @@ Setting `device_map="auto"` automatically fills all available space on the GPU(s
 CPU, and finally, the hard drive (the absolute slowest option) if there is still not enough memory.

 ```py
-from diffusers import FluxPipeline
-
 pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    transformer=transformer_8bit,
@@ -134,7 +132,7 @@ For Ada and higher-series GPUs. we recommend changing `torch_dtype` to `torch.bf
 ```py
 from diffusers import BitsAndBytesConfig as DiffusersBitsAndBytesConfig
 from transformers import BitsAndBytesConfig as TransformersBitsAndBytesConfig
-import torch
+
 from diffusers import AutoModel
 from transformers import T5EncoderModel

@@ -173,8 +171,6 @@ Let's generate an image using our quantized models.
 Setting `device_map="auto"` automatically fills all available space on the GPU(s) first, then the CPU, and finally, the hard drive (the absolute slowest option) if there is still not enough memory.

 ```py
-from diffusers import FluxPipeline
-
 pipe = FluxPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    transformer=transformer_4bit,
@@ -218,8 +214,6 @@ Check your memory footprint with the `get_memory_footprint` method:
 print(model.get_memory_footprint())
 ```

-Note that this only tells you the memory footprint of the model params and does _not_ estimate the inference memory requirements.
-
 Quantized models can be loaded from the [`~ModelMixin.from_pretrained`] method without needing to specify the `quantization_config` parameters:

 ```py
@@ -419,4 +413,4 @@ transformer_4bit.dequantize()
 ## Resources

 * [End-to-end notebook showing Flux.1 Dev inference in a free-tier Colab](https://gist.github.com/sayakpaul/c76bd845b48759e11687ac550b99d8b4)
-* [Training](https://github.com/huggingface/diffusers/blob/8c661ea586bf11cb2440da740dd3c4cf84679b85/examples/dreambooth/README_hidream.md#using-quantization)
+* [Training](https://gist.github.com/sayakpaul/05afd428bc089b47af7c016e42004527)
@@ -39,90 +39,3 @@ Diffusers currently supports the following quantization methods.
 - [Quanto](./quanto.md)

 [This resource](https://huggingface.co/docs/transformers/main/en/quantization/overview#when-to-use-what) provides a good overview of the pros and cons of different quantization techniques.
-
-## Pipeline-level quantization
-
-Diffusers allows users to directly initialize pipelines from checkpoints that may contain quantized models ([example](https://huggingface.co/hf-internal-testing/flux.1-dev-nf4-pkg)). However, users may want to apply
-quantization on-the-fly when initializing a pipeline from a pre-trained and non-quantized checkpoint. You can
-do this with [`~quantizers.PipelineQuantizationConfig`].
-
-Start by defining a `PipelineQuantizationConfig`:
-
-```py
-import torch
-from diffusers import DiffusionPipeline
-from diffusers.quantizers.quantization_config import QuantoConfig
-from diffusers.quantizers import PipelineQuantizationConfig
-from transformers import BitsAndBytesConfig
-
-pipeline_quant_config = PipelineQuantizationConfig(
-    quant_mapping={
-        "transformer": QuantoConfig(weights_dtype="int8"),
-        "text_encoder_2": BitsAndBytesConfig(
-            load_in_4bit=True, compute_dtype=torch.bfloat16
-        ),
-    }
-)
-```
-
-Then pass it to [`~DiffusionPipeline.from_pretrained`] and run inference:
-
-```py
-pipe = DiffusionPipeline.from_pretrained(
-    "black-forest-labs/FLUX.1-dev",
-    quantization_config=pipeline_quant_config,
-    torch_dtype=torch.bfloat16,
-).to("cuda")
-
-image = pipe("photo of a cute dog").images[0]
-```
-
-This method allows for more granular control over the quantization specifications of individual 
-model-level components of a pipeline. It also allows for different quantization backends for
-different components. In the above example, you used a combination of Quanto and bitsandbytes. However,
-one caveat of this method is that users need to know which components come from `transformers` to be able
-to import the right quantization config class.
-
-The other method offers a simpler experience but is
-less flexible. Start by defining a `PipelineQuantizationConfig` but in a different way:
-
-```py
-pipeline_quant_config = PipelineQuantizationConfig(
-    quant_backend="bitsandbytes_4bit",
-    quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
-    components_to_quantize=["transformer", "text_encoder_2"],
-)
-```
-
-This `pipeline_quant_config` can now be passed to [`~DiffusionPipeline.from_pretrained`] similar to the above example.
-
-In this case, `quant_kwargs` will be used to initialize the quantization specifications
-of the respective quantization configuration class of `quant_backend`. `components_to_quantize`
-is used to denote the components that will be quantized. For most pipelines, you would want to
-keep `transformer` in the list as that is often the most compute and memory intensive.
-
-The config below will work for most diffusion pipelines that have a `transformer` component present.
-In most cases, you will want to quantize the `transformer` component as that is often the most
-compute-intensive part of a diffusion pipeline.
-
-```py
-pipeline_quant_config = PipelineQuantizationConfig(
-    quant_backend="bitsandbytes_4bit",
-    quant_kwargs={"load_in_4bit": True, "bnb_4bit_quant_type": "nf4", "bnb_4bit_compute_dtype": torch.bfloat16},
-    components_to_quantize=["transformer"],
-)
-```
-
-Below is a list of the supported quantization backends available in both `diffusers` and `transformers`:
-
-* `bitsandbytes_4bit` 
-* `bitsandbytes_8bit`
-* `gguf`
-* `quanto`
-* `torchao`
-
-
-Diffusion pipelines can have multiple text encoders. [`FluxPipeline`] has two, for example. It's
-recommended to quantize the text encoders that are memory-intensive. Some examples include T5,
-Llama, Gemma, etc. In the above example, you quantized the T5 model of [`FluxPipeline`] through
-`text_encoder_2` while keeping the CLIP model intact (accessible through `text_encoder`). 
@@ -430,9 +430,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -1557,7 +1554,6 @@ def main(args):
    transformer_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=target_modules,
    )
@@ -1566,7 +1562,6 @@ def main(args):
        text_lora_config = LoraConfig(
            r=args.rank,
            lora_alpha=args.rank,
-            lora_dropout=args.lora_dropout,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -658,8 +658,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--use_dora",
        action="store_true",
@@ -1250,7 +1248,6 @@ def main(args):
    unet_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        use_dora=args.use_dora,
        init_lora_weights="gaussian",
        target_modules=["to_k", "to_q", "to_v", "to_out.0"],
@@ -1263,7 +1260,6 @@ def main(args):
        text_lora_config = LoraConfig(
            r=args.rank,
            lora_alpha=args.rank,
-            lora_dropout=args.lora_dropout,
            use_dora=args.use_dora,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
@@ -767,9 +767,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--use_dora",
        action="store_true",
@@ -1561,7 +1558,6 @@ def main(args):
        r=args.rank,
        use_dora=args.use_dora,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=target_modules,
    )
@@ -1574,7 +1570,6 @@ def main(args):
            r=args.rank,
            use_dora=args.use_dora,
            lora_alpha=args.rank,
-            lora_dropout=args.lora_dropout,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -524,9 +524,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--image_interpolation_mode",
        type=str,
@@ -935,7 +932,6 @@ def main(args):
    unet_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=["to_k", "to_q", "to_v", "to_out.0", "add_k_proj", "add_v_proj"],
    )
@@ -946,7 +942,6 @@ def main(args):
        text_lora_config = LoraConfig(
            r=args.rank,
            lora_alpha=args.rank,
-            lora_dropout=args.lora_dropout,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -358,9 +358,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -1239,7 +1236,6 @@ def main(args):
    transformer_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=target_modules,
    )
@@ -1248,7 +1244,6 @@ def main(args):
        text_lora_config = LoraConfig(
            r=args.rank,
            lora_alpha=args.rank,
-            lora_dropout=args.lora_dropout,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -417,9 +417,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -1164,7 +1161,6 @@ def main(args):
    transformer_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=target_modules,
    )
@@ -328,9 +328,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -1026,7 +1023,6 @@ def main(args):
    transformer_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=target_modules,
    )
@@ -323,9 +323,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -1024,7 +1021,6 @@ def main(args):
    transformer_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=target_modules,
    )
@@ -367,9 +367,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--with_prior_preservation",
        default=False,
@@ -1267,7 +1264,6 @@ def main(args):
    transformer_lora_config = LoraConfig(
        r=args.rank,
        lora_alpha=args.rank,
-        lora_dropout=args.lora_dropout,
        init_lora_weights="gaussian",
        target_modules=target_modules,
    )
@@ -1277,7 +1273,6 @@ def main(args):
        text_lora_config = LoraConfig(
            r=args.rank,
            lora_alpha=args.rank,
-            lora_dropout=args.lora_dropout,
            init_lora_weights="gaussian",
            target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
        )
@@ -659,9 +659,6 @@ def parse_args(input_args=None):
        default=4,
        help=("The dimension of the LoRA update matrices."),
    )
-
-    parser.add_argument("--lora_dropout", type=float, default=0.0, help="Dropout probability for LoRA layers")
-
    parser.add_argument(
        "--use_dora",
        action="store_true",
@@ -1202,11 +1199,10 @@ def main(args):
            text_encoder_one.gradient_checkpointing_enable()
            text_encoder_two.gradient_checkpointing_enable()

-    def get_lora_config(rank, dropout, use_dora, target_modules):
+    def get_lora_config(rank, use_dora, target_modules):
        base_config = {
            "r": rank,
            "lora_alpha": rank,
-            "lora_dropout": dropout,
            "init_lora_weights": "gaussian",
            "target_modules": target_modules,
        }
@@ -1222,24 +1218,14 @@ def main(args):

    # now we will add new LoRA weights to the attention layers
    unet_target_modules = ["to_k", "to_q", "to_v", "to_out.0"]
-    unet_lora_config = get_lora_config(
-        rank=args.rank,
-        dropout=args.lora_dropout,
-        use_dora=args.use_dora,
-        target_modules=unet_target_modules,
-    )
+    unet_lora_config = get_lora_config(rank=args.rank, use_dora=args.use_dora, target_modules=unet_target_modules)
    unet.add_adapter(unet_lora_config)

    # The text encoder comes from 🤗 transformers, so we cannot directly modify it.
    # So, instead, we monkey-patch the forward calls of its attention-blocks.
    if args.train_text_encoder:
        text_target_modules = ["q_proj", "k_proj", "v_proj", "out_proj"]
-        text_lora_config = get_lora_config(
-            rank=args.rank,
-            dropout=args.lora_dropout,
-            use_dora=args.use_dora,
-            target_modules=text_target_modules,
-        )
+        text_lora_config = get_lora_config(rank=args.rank, use_dora=args.use_dora, target_modules=text_target_modules)
        text_encoder_one.add_adapter(text_lora_config)
        text_encoder_two.add_adapter(text_lora_config)

@@ -1,95 +0,0 @@
-# Training SANA Sprint Diffuser
-
-This README explains how to use the provided bash script commands to download a pre-trained teacher diffuser model and train it on a specific dataset, following the [SANA Sprint methodology](https://arxiv.org/abs/2503.09641).
-
-
-## Setup
-
-### 1. Define the local paths
-
-Set a variable for your desired output directory. This directory will store the downloaded model and the training checkpoints/results.
-
-```bash
-your_local_path='output' # Or any other path you prefer
-mkdir -p $your_local_path # Create the directory if it doesn't exist
-```
-
-### 2. Download the pre-trained model
-
-Download the SANA Sprint teacher model from Hugging Face Hub. The script uses the 1.6B parameter model.
-
-```bash
-huggingface-cli download Efficient-Large-Model/SANA_Sprint_1.6B_1024px_teacher_diffusers --local-dir $your_local_path/SANA_Sprint_1.6B_1024px_teacher_diffusers
-```
-
-*(Optional: You can also download the 0.6B model by replacing the model name: `Efficient-Large-Model/Sana_Sprint_0.6B_1024px_teacher_diffusers`)*
-
-### 3. Acquire the dataset shards
-
-The training script in this example uses specific `.parquet` shards from the `brivangl/midjourney-v6-llava` dataset (chosen arbitrarily for this example) instead of downloading the entire dataset automatically via `dataset_name`.
-
-The script specifically uses these three files:
-*   `data/train_000.parquet`
-*   `data/train_001.parquet`
-*   `data/train_002.parquet`
-
-
-
-You can either:
-
-Let the script download the dataset automatically during first run
-
-Or download it manually
-
-**Note:** The full `brivangl/midjourney-v6-llava` dataset is much larger and contains many more shards. This script example explicitly trains *only* on the three specified shards.
-
-## Usage
-
-Once the model is downloaded, you can run the training script.
-
-```bash
-
-your_local_path='output' # Ensure this variable is set
-
-python train_sana_sprint_diffusers.py \
-    --pretrained_model_name_or_path=$your_local_path/SANA_Sprint_1.6B_1024px_teacher_diffusers \
-    --output_dir=$your_local_path \
-    --mixed_precision=bf16 \
-    --resolution=1024 \
-    --learning_rate=1e-6 \
-    --max_train_steps=30000 \
-    --dataloader_num_workers=8 \
-    --dataset_name='brivangl/midjourney-v6-llava' \
-    --file_path data/train_000.parquet data/train_001.parquet data/train_002.parquet \
-    --checkpointing_steps=500 --checkpoints_total_limit=10 \
-    --train_batch_size=1 \
-    --gradient_accumulation_steps=1 \
-    --seed=453645634 \
-    --train_largest_timestep \
-    --misaligned_pairs_D \
-    --gradient_checkpointing \
-    --resume_from_checkpoint="latest" \
-```
-
-### Explanation of parameters
-
-*   `--pretrained_model_name_or_path`: Path to the downloaded pre-trained model directory.
-*   `--output_dir`: Directory where training logs, checkpoints, and the final model will be saved.
-*   `--mixed_precision`: Use BF16 mixed precision for training, which can save memory and speed up training on compatible hardware.
-*   `--resolution`: The image resolution used for training (1024x1024).
-*   `--learning_rate`: The learning rate for the optimizer.
-*   `--max_train_steps`: The total number of training steps to perform.
-*   `--dataloader_num_workers`: Number of worker processes for loading data. Increase for faster data loading if your CPU and disk can handle it.
-*   `--dataset_name`: The name of the dataset on Hugging Face Hub (`brivangl/midjourney-v6-llava`).
-*   `--file_path`: **Specifies the local paths to the dataset shards to be used for training.** In this case, `data/train_000.parquet`, `data/train_001.parquet`, and `data/train_002.parquet`.
-*   `--checkpointing_steps`: Save a training checkpoint every X steps.
-*   `--checkpoints_total_limit`: Maximum number of checkpoints to keep. Older checkpoints will be deleted.
-*   `--train_batch_size`: The batch size per GPU.
-*   `--gradient_accumulation_steps`: Number of steps to accumulate gradients before performing an optimizer step.
-*   `--seed`: Random seed for reproducibility.
-*   `--train_largest_timestep`: A specific training strategy focusing on larger timesteps.
-*   `--misaligned_pairs_D`: Another specific training strategy to add misaligned image-text pairs as fake data for GAN.
-*   `--gradient_checkpointing`: Enable gradient checkpointing to save GPU memory.
-*   `--resume_from_checkpoint`: Allows resuming training from the latest saved checkpoint in the `--output_dir`.
-
-
@@ -1,26 +0,0 @@
-your_local_path='output'
-
-huggingface-cli download Efficient-Large-Model/SANA_Sprint_1.6B_1024px_teacher_diffusers  --local-dir $your_local_path/SANA_Sprint_1.6B_1024px_teacher_diffusers
-
-# or Sana_Sprint_0.6B_1024px_teacher_diffusers
-
-python train_sana_sprint_diffusers.py \
-    --pretrained_model_name_or_path=$your_local_path/SANA_Sprint_1.6B_1024px_teacher_diffusers \
-    --output_dir=$your_local_path \
-    --mixed_precision=bf16 \
-    --resolution=1024 \
-    --learning_rate=1e-6 \
-    --max_train_steps=30000 \
-    --dataloader_num_workers=8 \
-    --dataset_name='brivangl/midjourney-v6-llava' \
-    --file_path data/train_000.parquet data/train_001.parquet data/train_002.parquet \
-    --checkpointing_steps=500 --checkpoints_total_limit=10 \
-    --train_batch_size=1 \
-    --gradient_accumulation_steps=1 \
-    --seed=453645634 \
-    --train_largest_timestep \
-    --misaligned_pairs_D \
-    --gradient_checkpointing \
-    --resume_from_checkpoint="latest" \
-
-
@@ -1,352 +0,0 @@
-import argparse
-import pathlib
-from typing import Any, Dict
-
-import torch
-from accelerate import init_empty_weights
-from huggingface_hub import snapshot_download
-from transformers import T5EncoderModel, T5TokenizerFast
-
-from diffusers import AutoencoderKLCosmos, CosmosTextToWorldPipeline, CosmosTransformer3DModel, EDMEulerScheduler
-
-
-def remove_keys_(key: str, state_dict: Dict[str, Any]):
-    """Delete ``key`` from ``state_dict`` in place; a missing key raises ``KeyError``."""
-    del state_dict[key]
-
-
-def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> None:
-    """Move the value stored under ``old_key`` to ``new_key`` inside ``state_dict``.
-
-    The dictionary is modified in place and nothing is returned. A missing ``old_key`` raises ``KeyError``.
-    """
-    state_dict[new_key] = state_dict.pop(old_key)
-
-
-def rename_transformer_blocks_(key: str, state_dict: Dict[str, Any]):
-    """Re-key one entry from ``blocks.block<N>...`` to ``transformer_blocks.<N>...`` in place.
-
-    The block number is read from the second dot-separated component of ``key``.
-    """
-    block_component = key.split(".")[1]
-    block_index = int(block_component.removeprefix("block"))
-
-    source_prefix = f"blocks.block{block_index}"
-    target_prefix = f"transformer_blocks.{block_index}"
-    remainder = key.removeprefix(source_prefix)
-
-    state_dict[target_prefix + remainder] = state_dict.pop(key)
-
-
-# Substring -> replacement pairs for transformer checkpoint keys.
-# convert_transformer applies them to every key with str.replace, one after the other in the
-# order written here, so reordering entries can change the resulting key names.
-TRANSFORMER_KEYS_RENAME_DICT = {
-    "t_embedder.1": "time_embed.t_embedder",
-    "affline_norm": "time_embed.norm",
-    ".blocks.0.block.attn": ".attn1",
-    ".blocks.1.block.attn": ".attn2",
-    ".blocks.2.block": ".ff",
-    ".blocks.0.adaLN_modulation.1": ".norm1.linear_1",
-    ".blocks.0.adaLN_modulation.2": ".norm1.linear_2",
-    ".blocks.1.adaLN_modulation.1": ".norm2.linear_1",
-    ".blocks.1.adaLN_modulation.2": ".norm2.linear_2",
-    ".blocks.2.adaLN_modulation.1": ".norm3.linear_1",
-    ".blocks.2.adaLN_modulation.2": ".norm3.linear_2",
-    "to_q.0": "to_q",
-    "to_q.1": "norm_q",
-    "to_k.0": "to_k",
-    "to_k.1": "norm_k",
-    "to_v.0": "to_v",
-    "layer1": "net.0.proj",
-    "layer2": "net.2",
-    "proj.1": "proj",
-    "x_embedder": "patch_embed",
-    "extra_pos_embedder": "learnable_pos_embed",
-    "final_layer.adaLN_modulation.1": "norm_out.linear_1",
-    "final_layer.adaLN_modulation.2": "norm_out.linear_2",
-    "final_layer.linear": "proj_out",
-}
-
-# Substring -> in-place handler. After the plain renames, convert_transformer calls
-# handler(key, state_dict) for every key that contains the substring: per-block keys are
-# re-prefixed, and the listed logvar / pos_embedder entries are dropped from the state dict.
-TRANSFORMER_SPECIAL_KEYS_REMAP = {
-    "blocks.block": rename_transformer_blocks_,
-    "logvar.0.freqs": remove_keys_,
-    "logvar.0.phases": remove_keys_,
-    "logvar.1.weight": remove_keys_,
-    "pos_embedder.seq": remove_keys_,
-}
-
-# Constructor arguments shared by all transformer variants; the values are those of the
-# 7B Text2World model, and the other variants override only the fields that differ.
-_TRANSFORMER_BASE_CONFIG = {
-    "in_channels": 16,
-    "out_channels": 16,
-    "num_attention_heads": 32,
-    "attention_head_dim": 128,
-    "num_layers": 28,
-    "mlp_ratio": 4.0,
-    "text_embed_dim": 1024,
-    "adaln_lora_dim": 256,
-    "max_size": (128, 240, 240),
-    "patch_size": (1, 2, 2),
-    "rope_scale": (2.0, 1.0, 1.0),
-    "concat_padding_mask": True,
-    "extra_pos_embed_type": "learnable",
-}
-# Fields that change for the 14B models.
-_TRANSFORMER_14B_OVERRIDES = {
-    "num_attention_heads": 40,
-    "num_layers": 36,
-    "rope_scale": (2.0, 2.0, 2.0),
-}
-# Fields that change for the Video2World models.
-_TRANSFORMER_VIDEO2WORLD_OVERRIDES = {"in_channels": 16 + 1}
-
-# Model name -> keyword arguments passed to CosmosTransformer3DModel by convert_transformer.
-# Every override key already exists in the base, so each entry keeps the base key order.
-TRANSFORMER_CONFIGS = {
-    "Cosmos-1.0-Diffusion-7B-Text2World": {**_TRANSFORMER_BASE_CONFIG},
-    "Cosmos-1.0-Diffusion-7B-Video2World": {
-        **_TRANSFORMER_BASE_CONFIG,
-        **_TRANSFORMER_VIDEO2WORLD_OVERRIDES,
-    },
-    "Cosmos-1.0-Diffusion-14B-Text2World": {
-        **_TRANSFORMER_BASE_CONFIG,
-        **_TRANSFORMER_14B_OVERRIDES,
-    },
-    "Cosmos-1.0-Diffusion-14B-Video2World": {
-        **_TRANSFORMER_BASE_CONFIG,
-        **_TRANSFORMER_14B_OVERRIDES,
-        **_TRANSFORMER_VIDEO2WORLD_OVERRIDES,
-    },
-}
-
-# Substring -> replacement pairs for VAE checkpoint keys.
-# convert_vae applies them to every key with str.replace, one after the other in the order
-# written here, so reordering entries can change the resulting key names.
-VAE_KEYS_RENAME_DICT = {
-    "down.0": "down_blocks.0",
-    "down.1": "down_blocks.1",
-    "down.2": "down_blocks.2",
-    # Note the reversed index mapping for the up path: up.0 -> up_blocks.2, up.2 -> up_blocks.0.
-    "up.0": "up_blocks.2",
-    "up.1": "up_blocks.1",
-    "up.2": "up_blocks.0",
-    ".block.": ".resnets.",
-    "downsample": "downsamplers.0",
-    "upsample": "upsamplers.0",
-    "mid.block_1": "mid_block.resnets.0",
-    "mid.attn_1.0": "mid_block.attentions.0",
-    "mid.attn_1.1": "mid_block.temp_attentions.0",
-    "mid.block_2": "mid_block.resnets.1",
-    ".q.conv3d": ".to_q",
-    ".k.conv3d": ".to_k",
-    ".v.conv3d": ".to_v",
-    ".proj_out.conv3d": ".to_out.0",
-    ".0.conv3d": ".conv_s",
-    ".1.conv3d": ".conv_t",
-    "conv1.conv3d": "conv1",
-    "conv2.conv3d": "conv2",
-    "conv3.conv3d": "conv3",
-    "nin_shortcut.conv3d": "conv_shortcut",
-    "quant_conv.conv3d": "quant_conv",
-    "post_quant_conv.conv3d": "post_quant_conv",
-}
-
-# Substring -> in-place handler. After the plain renames, convert_vae removes every key
-# that contains one of these substrings from the state dict.
-VAE_SPECIAL_KEYS_REMAP = {
-    "wavelets": remove_keys_,
-    "_arange": remove_keys_,
-    "patch_size_buffer": remove_keys_,
-}
-
-# Constructor arguments shared by both CV8x8x8 tokenizer releases listed below.
-_CV8X8X8_DIFFUSERS_CONFIG = {
-    "in_channels": 3,
-    "out_channels": 3,
-    "latent_channels": 16,
-    "encoder_block_out_channels": (128, 256, 512, 512),
-    "decode_block_out_channels": (256, 512, 512, 512),
-    "attention_resolutions": (32,),
-    "resolution": 1024,
-    "num_layers": 2,
-    "patch_size": 4,
-    "patch_type": "haar",
-    "scaling_factor": 1.0,
-    "spatial_compression_ratio": 8,
-    "temporal_compression_ratio": 8,
-    "latents_mean": None,
-    "latents_std": None,
-}
-
-# VAE type -> Hub repository name plus the keyword arguments for AutoencoderKLCosmos.
-# Each entry gets its own copy of the shared config so that modifying one entry's
-# "diffusers_config" in place never affects the other.
-VAE_CONFIGS = {
-    "CV8x8x8-0.1": {
-        "name": "nvidia/Cosmos-0.1-Tokenizer-CV8x8x8",
-        "diffusers_config": dict(_CV8X8X8_DIFFUSERS_CONFIG),
-    },
-    "CV8x8x8-1.0": {
-        "name": "nvidia/Cosmos-1.0-Tokenizer-CV8x8x8",
-        "diffusers_config": dict(_CV8X8X8_DIFFUSERS_CONFIG),
-    },
-}
-
-
-def get_state_dict(saved_dict: Dict[str, Any]) -> Dict[str, Any]:
-    """Unwrap a loaded checkpoint down to the actual parameter dictionary.
-
-    Checkpoints may nest the weights under ``"model"``, ``"module"`` and/or ``"state_dict"``.
-    Each wrapper is looked up on the dictionary unwrapped so far (not on the outermost one),
-    so nested wrappers are followed and unrelated sibling keys on the outer dictionary cannot
-    trigger a ``KeyError``.
-
-    Args:
-        saved_dict: The object returned by loading the checkpoint file.
-
-    Returns:
-        The innermost dictionary reached after removing the known wrappers, or ``saved_dict``
-        itself when none of the wrapper keys is present.
-    """
-    state_dict = saved_dict
-    for wrapper_key in ("model", "module", "state_dict"):
-        if wrapper_key in state_dict:
-            state_dict = state_dict[wrapper_key]
-    return state_dict
-
-
-def convert_transformer(transformer_type: str, ckpt_path: str):
-    """Build a ``CosmosTransformer3DModel`` from an original checkpoint file.
-
-    Args:
-        transformer_type: Key into ``TRANSFORMER_CONFIGS`` selecting the model configuration.
-        ckpt_path: Path of the original checkpoint, loaded on CPU with ``weights_only=True``.
-
-    Returns:
-        The model with the renamed checkpoint tensors assigned to it (strict loading).
-    """
-    checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=True)
-    original_state_dict = get_state_dict(checkpoint)
-
-    # Instantiate under init_empty_weights; the tensors come from the checkpoint via assign=True below.
-    with init_empty_weights():
-        transformer = CosmosTransformer3DModel(**TRANSFORMER_CONFIGS[transformer_type])
-
-    # Pass 1: drop the leading "net." (if any) and apply the substring renames in table order.
-    for source_name in list(original_state_dict.keys()):
-        target_name = source_name.removeprefix("net.")
-        for pattern, replacement in TRANSFORMER_KEYS_RENAME_DICT.items():
-            target_name = target_name.replace(pattern, replacement)
-        update_state_dict_(original_state_dict, source_name, target_name)
-
-    # Pass 2: run the in-place handlers on every key that contains a special substring.
-    for name in list(original_state_dict.keys()):
-        for marker, handler_fn_inplace in TRANSFORMER_SPECIAL_KEYS_REMAP.items():
-            if marker in name:
-                handler_fn_inplace(name, original_state_dict)
-
-    transformer.load_state_dict(original_state_dict, strict=True, assign=True)
-    return transformer
-
-
-def convert_vae(vae_type: str):
-    """Build an ``AutoencoderKLCosmos`` from the original tokenizer repository on the Hub.
-
-    The repository named in ``VAE_CONFIGS[vae_type]["name"]`` is fetched with
-    ``snapshot_download``; weights are read from ``autoencoder.jit`` and, when the file
-    exists, latent statistics from ``mean_std.pt``.
-
-    Args:
-        vae_type: Key into ``VAE_CONFIGS``.
-
-    Returns:
-        The model with the renamed checkpoint tensors assigned to it (strict loading).
-    """
-    model_name = VAE_CONFIGS[vae_type]["name"]
-    snapshot_directory = snapshot_download(model_name, repo_type="model")
-    directory = pathlib.Path(snapshot_directory)
-
-    autoencoder_file = directory / "autoencoder.jit"
-    mean_std_file = directory / "mean_std.pt"
-
-    original_state_dict = torch.jit.load(autoencoder_file.as_posix()).state_dict()
-
-    # Copy the config so the module-level table is not modified by this call.
-    config = dict(VAE_CONFIGS[vae_type]["diffusers_config"])
-    if mean_std_file.exists():
-        mean_std = torch.load(mean_std_file, map_location="cpu", weights_only=True)
-        config["latents_mean"] = mean_std[0].detach().cpu().numpy().tolist()
-        config["latents_std"] = mean_std[1].detach().cpu().numpy().tolist()
-    # Without a statistics file the values already present in the config are kept; previously
-    # this path called .detach() on None and crashed.
-    vae = AutoencoderKLCosmos(**config)
-
-    # Pass 1: apply the substring renames in table order.
-    for key in list(original_state_dict.keys()):
-        new_key = key
-        for replace_key, rename_key in VAE_KEYS_RENAME_DICT.items():
-            new_key = new_key.replace(replace_key, rename_key)
-        update_state_dict_(original_state_dict, key, new_key)
-
-    # Pass 2: run the in-place handlers on every key that contains a special substring.
-    for key in list(original_state_dict.keys()):
-        for special_key, handler_fn_inplace in VAE_SPECIAL_KEYS_REMAP.items():
-            if special_key not in key:
-                continue
-            handler_fn_inplace(key, original_state_dict)
-
-    vae.load_state_dict(original_state_dict, strict=True, assign=True)
-    return vae
-
-
-def get_args():
-    """Parse the command-line options of the conversion script.
-
-    Returns:
-        The ``argparse.Namespace`` produced by ``parse_args()``.
-    """
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--transformer_type", type=str, default=None, choices=list(TRANSFORMER_CONFIGS.keys()))
-    parser.add_argument(
-        "--transformer_ckpt_path", type=str, default=None, help="Path to original transformer checkpoint"
-    )
-    parser.add_argument("--vae_type", type=str, default=None, choices=list(VAE_CONFIGS.keys()), help="Type of VAE")
-    parser.add_argument("--text_encoder_path", type=str, default="google-t5/t5-11b")
-    parser.add_argument("--tokenizer_path", type=str, default="google-t5/t5-11b")
-    parser.add_argument("--save_pipeline", action="store_true")
-    parser.add_argument("--output_path", type=str, required=True, help="Path where converted model should be saved")
-    # Restrict --dtype to the known names so a typo is reported by argparse instead of surfacing
-    # later as a bare KeyError. DTYPE_MAPPING is defined further down in this module and is
-    # resolved when this function is called.
-    parser.add_argument(
-        "--dtype",
-        default="bf16",
-        choices=list(DTYPE_MAPPING.keys()),
-        help="Torch dtype to save the transformer in.",
-    )
-    return parser.parse_args()
-
-
-# Accepted --dtype strings and the torch dtype each one selects.
-DTYPE_MAPPING = {
-    "fp32": torch.float32,
-    "fp16": torch.float16,
-    "bf16": torch.bfloat16,
-}
-
-
-if __name__ == "__main__":
-    args = get_args()
-
-    transformer = None
-    vae = None
-    dtype = DTYPE_MAPPING[args.dtype]
-
-    # Validate argument combinations explicitly; assert statements are skipped under `python -O`.
-    if args.save_pipeline:
-        required_for_pipeline = ("transformer_ckpt_path", "vae_type", "text_encoder_path", "tokenizer_path")
-        missing = [f"--{name}" for name in required_for_pipeline if getattr(args, name) is None]
-        if missing:
-            raise ValueError(f"--save_pipeline requires the following arguments: {', '.join(missing)}")
-    if args.transformer_ckpt_path is not None and args.transformer_type is None:
-        # convert_transformer looks the type up in TRANSFORMER_CONFIGS, so None cannot work.
-        raise ValueError("--transformer_type is required when --transformer_ckpt_path is given")
-
-    # NOTE(review): without --save_pipeline, passing both a transformer checkpoint and a VAE type
-    # saves both models to the same output_path.
-    if args.transformer_ckpt_path is not None:
-        transformer = convert_transformer(args.transformer_type, args.transformer_ckpt_path)
-        transformer = transformer.to(dtype=dtype)
-        if not args.save_pipeline:
-            transformer.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
-
-    if args.vae_type is not None:
-        vae = convert_vae(args.vae_type)
-        if not args.save_pipeline:
-            vae.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
-
-    if args.save_pipeline:
-        text_encoder = T5EncoderModel.from_pretrained(args.text_encoder_path, torch_dtype=dtype)
-        tokenizer = T5TokenizerFast.from_pretrained(args.tokenizer_path)
-        # The original code initializes EDM config with sigma_min=0.0002, but does not make use of it anywhere directly.
-        # So, the sigma_min value that is used is the default value of 0.002.
-        scheduler = EDMEulerScheduler(
-            sigma_min=0.002,
-            sigma_max=80,
-            sigma_data=0.5,
-            sigma_schedule="karras",
-            num_train_timesteps=1000,
-            prediction_type="epsilon",
-            rho=7.0,
-            final_sigmas_type="sigma_min",
-        )
-
-        pipe = CosmosTextToWorldPipeline(
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            transformer=transformer,
-            vae=vae,
-            scheduler=scheduler,
-        )
-        pipe.save_pretrained(args.output_path, safe_serialization=True, max_shard_size="5GB")
@@ -148,7 +148,6 @@ else:
            "AutoencoderKL",
            "AutoencoderKLAllegro",
            "AutoencoderKLCogVideoX",
-            "AutoencoderKLCosmos",
            "AutoencoderKLHunyuanVideo",
            "AutoencoderKLLTXVideo",
            "AutoencoderKLMagvit",
@@ -167,7 +166,6 @@ else:
            "ControlNetModel",
            "ControlNetUnionModel",
            "ControlNetXSAdapter",
-            "CosmosTransformer3DModel",
            "DiTTransformer2DModel",
            "EasyAnimateTransformer3DModel",
            "FluxControlNetModel",
@@ -177,7 +175,6 @@ else:
            "HunyuanDiT2DControlNetModel",
            "HunyuanDiT2DModel",
            "HunyuanDiT2DMultiControlNetModel",
-            "HunyuanVideoFramepackTransformer3DModel",
            "HunyuanVideoTransformer3DModel",
            "I2VGenXLUNet",
            "Kandinsky3UNet",
@@ -359,9 +356,6 @@ else:
            "CogView3PlusPipeline",
            "CogView4ControlPipeline",
            "CogView4Pipeline",
-            "ConsisIDPipeline",
-            "CosmosTextToWorldPipeline",
-            "CosmosVideoToWorldPipeline",
            "CycleDiffusionPipeline",
            "EasyAnimateControlPipeline",
            "EasyAnimateInpaintPipeline",
@@ -382,7 +376,6 @@ else:
            "HunyuanDiTPAGPipeline",
            "HunyuanDiTPipeline",
            "HunyuanSkyreelsImageToVideoPipeline",
-            "HunyuanVideoFramepackPipeline",
            "HunyuanVideoImageToVideoPipeline",
            "HunyuanVideoPipeline",
            "I2VGenXLPipeline",
@@ -750,7 +743,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            AutoencoderKL,
            AutoencoderKLAllegro,
            AutoencoderKLCogVideoX,
-            AutoencoderKLCosmos,
            AutoencoderKLHunyuanVideo,
            AutoencoderKLLTXVideo,
            AutoencoderKLMagvit,
@@ -769,7 +761,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            ControlNetModel,
            ControlNetUnionModel,
            ControlNetXSAdapter,
-            CosmosTransformer3DModel,
            DiTTransformer2DModel,
            EasyAnimateTransformer3DModel,
            FluxControlNetModel,
@@ -779,7 +770,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            HunyuanDiT2DControlNetModel,
            HunyuanDiT2DModel,
            HunyuanDiT2DMultiControlNetModel,
-            HunyuanVideoFramepackTransformer3DModel,
            HunyuanVideoTransformer3DModel,
            I2VGenXLUNet,
            Kandinsky3UNet,
@@ -940,9 +930,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            CogView3PlusPipeline,
            CogView4ControlPipeline,
            CogView4Pipeline,
-            ConsisIDPipeline,
-            CosmosTextToWorldPipeline,
-            CosmosVideoToWorldPipeline,
            CycleDiffusionPipeline,
            EasyAnimateControlPipeline,
            EasyAnimateInpaintPipeline,
@@ -963,7 +950,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            HunyuanDiTPAGPipeline,
            HunyuanDiTPipeline,
            HunyuanSkyreelsImageToVideoPipeline,
-            HunyuanVideoFramepackPipeline,
            HunyuanVideoImageToVideoPipeline,
            HunyuanVideoPipeline,
            I2VGenXLPipeline,
@@ -348,7 +348,7 @@ def _load_lora_into_text_encoder(

    # Load the layers corresponding to text encoder and make necessary adjustments.
    if prefix is not None:
-        state_dict = {k.removeprefix(f"{prefix}."): v for k, v in state_dict.items() if k.startswith(f"{prefix}.")}
+        state_dict = {k[len(f"{prefix}.") :]: v for k, v in state_dict.items() if k.startswith(f"{prefix}.")}

    if len(state_dict) > 0:
        logger.info(f"Loading {prefix}.")
@@ -374,7 +374,7 @@ def _load_lora_into_text_encoder(

        if network_alphas is not None:
            alpha_keys = [k for k in network_alphas.keys() if k.startswith(prefix) and k.split(".")[0] == prefix]
-            network_alphas = {k.removeprefix(f"{prefix}."): v for k, v in network_alphas.items() if k in alpha_keys}
+            network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}

        lora_config_kwargs = get_peft_kwargs(rank, network_alphas, state_dict, is_unet=False)

@@ -727,25 +727,8 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
            elif k.startswith("lora_te1_"):
                has_te_keys = True
                continue
-            elif k.startswith("lora_transformer_context_embedder"):
-                diffusers_key = "context_embedder"
-            elif k.startswith("lora_transformer_norm_out_linear"):
-                diffusers_key = "norm_out.linear"
-            elif k.startswith("lora_transformer_proj_out"):
-                diffusers_key = "proj_out"
-            elif k.startswith("lora_transformer_x_embedder"):
-                diffusers_key = "x_embedder"
-            elif k.startswith("lora_transformer_time_text_embed_guidance_embedder_linear_"):
-                i = int(k.split("lora_transformer_time_text_embed_guidance_embedder_linear_")[-1])
-                diffusers_key = f"time_text_embed.guidance_embedder.linear_{i}"
-            elif k.startswith("lora_transformer_time_text_embed_text_embedder_linear_"):
-                i = int(k.split("lora_transformer_time_text_embed_text_embedder_linear_")[-1])
-                diffusers_key = f"time_text_embed.text_embedder.linear_{i}"
-            elif k.startswith("lora_transformer_time_text_embed_timestep_embedder_linear_"):
-                i = int(k.split("lora_transformer_time_text_embed_timestep_embedder_linear_")[-1])
-                diffusers_key = f"time_text_embed.timestep_embedder.linear_{i}"
            else:
-                raise NotImplementedError(f"Handling for key ({k}) is not implemented.")
+                raise NotImplementedError

            if "attn_" in k:
                if "_to_out_0" in k:
@@ -1704,11 +1687,3 @@ def _convert_musubi_wan_lora_to_diffusers(state_dict):
        converted_state_dict[f"transformer.{key}"] = converted_state_dict.pop(key)

    return converted_state_dict
-
-
-def _convert_non_diffusers_hidream_lora_to_diffusers(state_dict, non_diffusers_prefix="diffusion_model"):
-    """Re-key a HiDream LoRA state dict from ``<non_diffusers_prefix>.*`` to ``transformer.*``.
-
-    Args:
-        state_dict: Mapping of LoRA parameter names to values.
-        non_diffusers_prefix: Leading name component used by the non-diffusers format.
-
-    Returns:
-        A new dict with ``"<non_diffusers_prefix>."`` replaced by ``"transformer."`` on every key.
-
-    Raises:
-        ValueError: If any key does not start with ``"<non_diffusers_prefix>."``.
-    """
-    # Validate against the same dotted prefix that is stripped below. Checking only the bare
-    # prefix would accept keys such as "diffusion_modelX.foo" and then leave them unstripped.
-    dotted_prefix = f"{non_diffusers_prefix}."
-    if not all(k.startswith(dotted_prefix) for k in state_dict):
-        raise ValueError("Invalid LoRA state dict for HiDream.")
-    return {f"transformer.{k.removeprefix(dotted_prefix)}": v for k, v in state_dict.items()}
@@ -43,7 +43,6 @@ from .lora_conversion_utils import (
    _convert_hunyuan_video_lora_to_diffusers,
    _convert_kohya_flux_lora_to_diffusers,
    _convert_musubi_wan_lora_to_diffusers,
-    _convert_non_diffusers_hidream_lora_to_diffusers,
    _convert_non_diffusers_lora_to_diffusers,
    _convert_non_diffusers_lumina2_lora_to_diffusers,
    _convert_non_diffusers_wan_lora_to_diffusers,
@@ -2104,7 +2103,7 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
        prefix = prefix or cls.transformer_name
        for key in list(state_dict.keys()):
            if key.split(".")[0] == prefix:
-                state_dict[key.removeprefix(f"{prefix}.")] = state_dict.pop(key)
+                state_dict[key[len(f"{prefix}.") :]] = state_dict.pop(key)

        # Find invalid keys
        transformer_state_dict = transformer.state_dict()
@@ -2426,7 +2425,7 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
        prefix = prefix or cls.transformer_name
        for key in list(state_dict.keys()):
            if key.split(".")[0] == prefix:
-                state_dict[key.removeprefix(f"{prefix}.")] = state_dict.pop(key)
+                state_dict[key[len(f"{prefix}.") :]] = state_dict.pop(key)

        # Expand transformer parameter shapes if they don't match lora
        has_param_with_shape_update = False
@@ -5372,6 +5371,7 @@ class HiDreamImageLoraLoaderMixin(LoraBaseMixin):

    @classmethod
    @validate_hf_hub_args
+    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.lora_state_dict
    def lora_state_dict(
        cls,
        pretrained_model_name_or_path_or_dict: Union[str, Dict[str, torch.Tensor]],
@@ -5465,10 +5465,6 @@ class HiDreamImageLoraLoaderMixin(LoraBaseMixin):
            logger.warning(warn_msg)
            state_dict = {k: v for k, v in state_dict.items() if "dora_scale" not in k}

-        is_non_diffusers_format = any("diffusion_model" in k for k in state_dict)
-        if is_non_diffusers_format:
-            state_dict = _convert_non_diffusers_hidream_lora_to_diffusers(state_dict)
-
        return state_dict

    # Copied from diffusers.loaders.lora_pipeline.CogVideoXLoraLoaderMixin.load_lora_weights
@@ -230,7 +230,7 @@ class PeftAdapterMixin:
            raise ValueError("`network_alphas` cannot be None when `prefix` is None.")

        if prefix is not None:
-            state_dict = {k.removeprefix(f"{prefix}."): v for k, v in state_dict.items() if k.startswith(f"{prefix}.")}
+            state_dict = {k[len(f"{prefix}.") :]: v for k, v in state_dict.items() if k.startswith(f"{prefix}.")}

        if len(state_dict) > 0:
            if adapter_name in getattr(self, "peft_config", {}) and not hotswap:
@@ -261,9 +261,7 @@ class PeftAdapterMixin:

            if network_alphas is not None and len(network_alphas) >= 1:
                alpha_keys = [k for k in network_alphas.keys() if k.startswith(f"{prefix}.")]
-                network_alphas = {
-                    k.removeprefix(f"{prefix}."): v for k, v in network_alphas.items() if k in alpha_keys
-                }
+                network_alphas = {k.replace(f"{prefix}.", ""): v for k, v in network_alphas.items() if k in alpha_keys}

            lora_config_kwargs = get_peft_kwargs(rank, network_alpha_dict=network_alphas, peft_state_dict=state_dict)
            _maybe_raise_error_for_ambiguity(lora_config_kwargs)
@@ -32,7 +32,6 @@ if is_torch_available():
    _import_structure["autoencoders.autoencoder_kl"] = ["AutoencoderKL"]
    _import_structure["autoencoders.autoencoder_kl_allegro"] = ["AutoencoderKLAllegro"]
    _import_structure["autoencoders.autoencoder_kl_cogvideox"] = ["AutoencoderKLCogVideoX"]
-    _import_structure["autoencoders.autoencoder_kl_cosmos"] = ["AutoencoderKLCosmos"]
    _import_structure["autoencoders.autoencoder_kl_hunyuan_video"] = ["AutoencoderKLHunyuanVideo"]
    _import_structure["autoencoders.autoencoder_kl_ltx"] = ["AutoencoderKLLTXVideo"]
    _import_structure["autoencoders.autoencoder_kl_magvit"] = ["AutoencoderKLMagvit"]
@@ -76,12 +75,10 @@ if is_torch_available():
    _import_structure["transformers.transformer_allegro"] = ["AllegroTransformer3DModel"]
    _import_structure["transformers.transformer_cogview3plus"] = ["CogView3PlusTransformer2DModel"]
    _import_structure["transformers.transformer_cogview4"] = ["CogView4Transformer2DModel"]
-    _import_structure["transformers.transformer_cosmos"] = ["CosmosTransformer3DModel"]
    _import_structure["transformers.transformer_easyanimate"] = ["EasyAnimateTransformer3DModel"]
    _import_structure["transformers.transformer_flux"] = ["FluxTransformer2DModel"]
    _import_structure["transformers.transformer_hidream_image"] = ["HiDreamImageTransformer2DModel"]
    _import_structure["transformers.transformer_hunyuan_video"] = ["HunyuanVideoTransformer3DModel"]
-    _import_structure["transformers.transformer_hunyuan_video_framepack"] = ["HunyuanVideoFramepackTransformer3DModel"]
    _import_structure["transformers.transformer_ltx"] = ["LTXVideoTransformer3DModel"]
    _import_structure["transformers.transformer_lumina2"] = ["Lumina2Transformer2DModel"]
    _import_structure["transformers.transformer_mochi"] = ["MochiTransformer3DModel"]
@@ -116,7 +113,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            AutoencoderKL,
            AutoencoderKLAllegro,
            AutoencoderKLCogVideoX,
-            AutoencoderKLCosmos,
            AutoencoderKLHunyuanVideo,
            AutoencoderKLLTXVideo,
            AutoencoderKLMagvit,
@@ -154,14 +150,12 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            CogView3PlusTransformer2DModel,
            CogView4Transformer2DModel,
            ConsisIDTransformer3DModel,
-            CosmosTransformer3DModel,
            DiTTransformer2DModel,
            DualTransformer2DModel,
            EasyAnimateTransformer3DModel,
            FluxTransformer2DModel,
            HiDreamImageTransformer2DModel,
            HunyuanDiT2DModel,
-            HunyuanVideoFramepackTransformer3DModel,
            HunyuanVideoTransformer3DModel,
            LatteTransformer3DModel,
            LTXVideoTransformer3DModel,
@@ -203,8 +203,8 @@ class Attention(nn.Module):
            self.norm_q = nn.LayerNorm(dim_head * heads, eps=eps)
            self.norm_k = nn.LayerNorm(dim_head * kv_heads, eps=eps)
        elif qk_norm == "rms_norm":
-            self.norm_q = RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
-            self.norm_k = RMSNorm(dim_head, eps=eps, elementwise_affine=elementwise_affine)
+            self.norm_q = RMSNorm(dim_head, eps=eps)
+            self.norm_k = RMSNorm(dim_head, eps=eps)
        elif qk_norm == "rms_norm_across_heads":
            # LTX applies qk norm across all heads
            self.norm_q = RMSNorm(dim_head * heads, eps=eps)
@@ -3,7 +3,6 @@ from .autoencoder_dc import AutoencoderDC
 from .autoencoder_kl import AutoencoderKL
 from .autoencoder_kl_allegro import AutoencoderKLAllegro
 from .autoencoder_kl_cogvideox import AutoencoderKLCogVideoX
-from .autoencoder_kl_cosmos import AutoencoderKLCosmos
 from .autoencoder_kl_hunyuan_video import AutoencoderKLHunyuanVideo
 from .autoencoder_kl_ltx import AutoencoderKLLTXVideo
 from .autoencoder_kl_magvit import AutoencoderKLMagvit
@@ -744,17 +744,6 @@ class DiagonalGaussianDistribution(object):
        return self.mean


-class IdentityDistribution(object):
-    """Distribution-like wrapper whose sample and mode are both the stored tensor itself."""
-
-    def __init__(self, parameters: torch.Tensor):
-        self.parameters = parameters
-
-    def mode(self) -> torch.Tensor:
-        """Return the stored tensor unchanged."""
-        return self.parameters
-
-    def sample(self, generator: Optional[torch.Generator] = None) -> torch.Tensor:
-        """Return the stored tensor unchanged; ``generator`` is accepted but not used."""
-        return self.parameters
-
-
 class EncoderTiny(nn.Module):
    r"""
    The `EncoderTiny` layer is a simpler version of the `Encoder` layer.
@@ -1204,7 +1204,7 @@ def apply_rotary_emb(
            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
        elif use_real_unbind_dim == -2:
-            # Used for Stable Audio, OmniGen, CogView4 and Cosmos
+            # Used for Stable Audio, OmniGen and CogView4
            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, S, H, D//2]
            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
        else:
@@ -19,12 +19,10 @@ if is_torch_available():
    from .transformer_allegro import AllegroTransformer3DModel
    from .transformer_cogview3plus import CogView3PlusTransformer2DModel
    from .transformer_cogview4 import CogView4Transformer2DModel
-    from .transformer_cosmos import CosmosTransformer3DModel
    from .transformer_easyanimate import EasyAnimateTransformer3DModel
    from .transformer_flux import FluxTransformer2DModel
    from .transformer_hidream_image import HiDreamImageTransformer2DModel
    from .transformer_hunyuan_video import HunyuanVideoTransformer3DModel
-    from .transformer_hunyuan_video_framepack import HunyuanVideoFramepackTransformer3DModel
    from .transformer_ltx import LTXVideoTransformer3DModel
    from .transformer_lumina2 import Lumina2Transformer2DModel
    from .transformer_mochi import MochiTransformer3DModel
@@ -1,555 +0,0 @@
-# Copyright 2024 The NVIDIA Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Optional, Tuple
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from ...configuration_utils import ConfigMixin, register_to_config
-from ...utils import is_torchvision_available
-from ..attention import FeedForward
-from ..attention_processor import Attention
-from ..embeddings import Timesteps
-from ..modeling_outputs import Transformer2DModelOutput
-from ..modeling_utils import ModelMixin
-from ..normalization import RMSNorm
-
-
-if is_torchvision_available():
-    from torchvision import transforms
-
-
-class CosmosPatchEmbed(nn.Module):
-    def __init__(
-        self, in_channels: int, out_channels: int, patch_size: Tuple[int, int, int], bias: bool = True
-    ) -> None:
-        super().__init__()
-        self.patch_size = patch_size
-
-        self.proj = nn.Linear(in_channels * patch_size[0] * patch_size[1] * patch_size[2], out_channels, bias=bias)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        batch_size, num_channels, num_frames, height, width = hidden_states.shape
-        p_t, p_h, p_w = self.patch_size
-        hidden_states = hidden_states.reshape(
-            batch_size, num_channels, num_frames // p_t, p_t, height // p_h, p_h, width // p_w, p_w
-        )
-        hidden_states = hidden_states.permute(0, 2, 4, 6, 1, 3, 5, 7).flatten(4, 7)
-        hidden_states = self.proj(hidden_states)
-        return hidden_states
-
-
-class CosmosTimestepEmbedding(nn.Module):
-    def __init__(self, in_features: int, out_features: int) -> None:
-        super().__init__()
-        self.linear_1 = nn.Linear(in_features, out_features, bias=False)
-        self.activation = nn.SiLU()
-        self.linear_2 = nn.Linear(out_features, 3 * out_features, bias=False)
-
-    def forward(self, timesteps: torch.Tensor) -> torch.Tensor:
-        emb = self.linear_1(timesteps)
-        emb = self.activation(emb)
-        emb = self.linear_2(emb)
-        return emb
-
-
-class CosmosEmbedding(nn.Module):
-    def __init__(self, embedding_dim: int, condition_dim: int) -> None:
-        super().__init__()
-
-        self.time_proj = Timesteps(embedding_dim, flip_sin_to_cos=True, downscale_freq_shift=0.0)
-        self.t_embedder = CosmosTimestepEmbedding(embedding_dim, condition_dim)
-        self.norm = RMSNorm(embedding_dim, eps=1e-6, elementwise_affine=True)
-
-    def forward(self, hidden_states: torch.Tensor, timestep: torch.LongTensor) -> torch.Tensor:
-        timesteps_proj = self.time_proj(timestep).type_as(hidden_states)
-        temb = self.t_embedder(timesteps_proj)
-        embedded_timestep = self.norm(timesteps_proj)
-        return temb, embedded_timestep
-
-
-class CosmosAdaLayerNorm(nn.Module):
-    def __init__(self, in_features: int, hidden_features: int) -> None:
-        super().__init__()
-        self.embedding_dim = in_features
-
-        self.activation = nn.SiLU()
-        self.norm = nn.LayerNorm(in_features, elementwise_affine=False, eps=1e-6)
-        self.linear_1 = nn.Linear(in_features, hidden_features, bias=False)
-        self.linear_2 = nn.Linear(hidden_features, 2 * in_features, bias=False)
-
-    def forward(
-        self, hidden_states: torch.Tensor, embedded_timestep: torch.Tensor, temb: Optional[torch.Tensor] = None
-    ) -> torch.Tensor:
-        embedded_timestep = self.activation(embedded_timestep)
-        embedded_timestep = self.linear_1(embedded_timestep)
-        embedded_timestep = self.linear_2(embedded_timestep)
-
-        if temb is not None:
-            embedded_timestep = embedded_timestep + temb[:, : 2 * self.embedding_dim]
-
-        shift, scale = embedded_timestep.chunk(2, dim=1)
-        hidden_states = self.norm(hidden_states)
-        hidden_states = hidden_states * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
-        return hidden_states
-
-
-class CosmosAdaLayerNormZero(nn.Module):
-    def __init__(self, in_features: int, hidden_features: Optional[int] = None) -> None:
-        super().__init__()
-
-        self.norm = nn.LayerNorm(in_features, elementwise_affine=False, eps=1e-6)
-        self.activation = nn.SiLU()
-
-        if hidden_features is None:
-            self.linear_1 = nn.Identity()
-        else:
-            self.linear_1 = nn.Linear(in_features, hidden_features, bias=False)
-
-        self.linear_2 = nn.Linear(hidden_features, 3 * in_features, bias=False)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        embedded_timestep: torch.Tensor,
-        temb: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        embedded_timestep = self.activation(embedded_timestep)
-        embedded_timestep = self.linear_1(embedded_timestep)
-        embedded_timestep = self.linear_2(embedded_timestep)
-
-        if temb is not None:
-            embedded_timestep = embedded_timestep + temb
-
-        shift, scale, gate = embedded_timestep.chunk(3, dim=1)
-        hidden_states = self.norm(hidden_states)
-        hidden_states = hidden_states * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)
-        return hidden_states, gate
-
-
-class CosmosAttnProcessor2_0:
-    def __init__(self):
-        if not hasattr(F, "scaled_dot_product_attention"):
-            raise ImportError("CosmosAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0.")
-
-    def __call__(
-        self,
-        attn: Attention,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        image_rotary_emb: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        # 1. QKV projections
-        if encoder_hidden_states is None:
-            encoder_hidden_states = hidden_states
-
-        query = attn.to_q(hidden_states)
-        key = attn.to_k(encoder_hidden_states)
-        value = attn.to_v(encoder_hidden_states)
-
-        query = query.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-        key = key.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-        value = value.unflatten(2, (attn.heads, -1)).transpose(1, 2)
-
-        # 2. QK normalization
-        query = attn.norm_q(query)
-        key = attn.norm_k(key)
-
-        # 3. Apply RoPE
-        if image_rotary_emb is not None:
-            from ..embeddings import apply_rotary_emb
-
-            query = apply_rotary_emb(query, image_rotary_emb, use_real=True, use_real_unbind_dim=-2)
-            key = apply_rotary_emb(key, image_rotary_emb, use_real=True, use_real_unbind_dim=-2)
-
-        # 4. Prepare for GQA
-        query_idx = torch.tensor(query.size(3), device=query.device)
-        key_idx = torch.tensor(key.size(3), device=key.device)
-        value_idx = torch.tensor(value.size(3), device=value.device)
-        key = key.repeat_interleave(query_idx // key_idx, dim=3)
-        value = value.repeat_interleave(query_idx // value_idx, dim=3)
-
-        # 5. Attention
-        hidden_states = F.scaled_dot_product_attention(
-            query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False
-        )
-        hidden_states = hidden_states.transpose(1, 2).flatten(2, 3).type_as(query)
-
-        # 6. Output projection
-        hidden_states = attn.to_out[0](hidden_states)
-        hidden_states = attn.to_out[1](hidden_states)
-
-        return hidden_states
-
-
-class CosmosTransformerBlock(nn.Module):
-    def __init__(
-        self,
-        num_attention_heads: int,
-        attention_head_dim: int,
-        cross_attention_dim: int,
-        mlp_ratio: float = 4.0,
-        adaln_lora_dim: int = 256,
-        qk_norm: str = "rms_norm",
-        out_bias: bool = False,
-    ) -> None:
-        super().__init__()
-
-        hidden_size = num_attention_heads * attention_head_dim
-
-        self.norm1 = CosmosAdaLayerNormZero(in_features=hidden_size, hidden_features=adaln_lora_dim)
-        self.attn1 = Attention(
-            query_dim=hidden_size,
-            cross_attention_dim=None,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            qk_norm=qk_norm,
-            elementwise_affine=True,
-            out_bias=out_bias,
-            processor=CosmosAttnProcessor2_0(),
-        )
-
-        self.norm2 = CosmosAdaLayerNormZero(in_features=hidden_size, hidden_features=adaln_lora_dim)
-        self.attn2 = Attention(
-            query_dim=hidden_size,
-            cross_attention_dim=cross_attention_dim,
-            heads=num_attention_heads,
-            dim_head=attention_head_dim,
-            qk_norm=qk_norm,
-            elementwise_affine=True,
-            out_bias=out_bias,
-            processor=CosmosAttnProcessor2_0(),
-        )
-
-        self.norm3 = CosmosAdaLayerNormZero(in_features=hidden_size, hidden_features=adaln_lora_dim)
-        self.ff = FeedForward(hidden_size, mult=mlp_ratio, activation_fn="gelu", bias=out_bias)
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        encoder_hidden_states: torch.Tensor,
-        embedded_timestep: torch.Tensor,
-        temb: Optional[torch.Tensor] = None,
-        image_rotary_emb: Optional[torch.Tensor] = None,
-        extra_pos_emb: Optional[torch.Tensor] = None,
-        attention_mask: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        if extra_pos_emb is not None:
-            hidden_states = hidden_states + extra_pos_emb
-
-        # 1. Self Attention
-        norm_hidden_states, gate = self.norm1(hidden_states, embedded_timestep, temb)
-        attn_output = self.attn1(norm_hidden_states, image_rotary_emb=image_rotary_emb)
-        hidden_states = hidden_states + gate.unsqueeze(1) * attn_output
-
-        # 2. Cross Attention
-        norm_hidden_states, gate = self.norm2(hidden_states, embedded_timestep, temb)
-        attn_output = self.attn2(
-            norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask
-        )
-        hidden_states = hidden_states + gate.unsqueeze(1) * attn_output
-
-        # 3. Feed Forward
-        norm_hidden_states, gate = self.norm3(hidden_states, embedded_timestep, temb)
-        ff_output = self.ff(norm_hidden_states)
-        hidden_states = hidden_states + gate.unsqueeze(1) * ff_output
-
-        return hidden_states
-
-
-class CosmosRotaryPosEmbed(nn.Module):
-    def __init__(
-        self,
-        hidden_size: int,
-        max_size: Tuple[int, int, int] = (128, 240, 240),
-        patch_size: Tuple[int, int, int] = (1, 2, 2),
-        base_fps: int = 24,
-        rope_scale: Tuple[float, float, float] = (2.0, 1.0, 1.0),
-    ) -> None:
-        super().__init__()
-
-        self.max_size = [size // patch for size, patch in zip(max_size, patch_size)]
-        self.patch_size = patch_size
-        self.base_fps = base_fps
-
-        self.dim_h = hidden_size // 6 * 2
-        self.dim_w = hidden_size // 6 * 2
-        self.dim_t = hidden_size - self.dim_h - self.dim_w
-
-        self.h_ntk_factor = rope_scale[1] ** (self.dim_h / (self.dim_h - 2))
-        self.w_ntk_factor = rope_scale[2] ** (self.dim_w / (self.dim_w - 2))
-        self.t_ntk_factor = rope_scale[0] ** (self.dim_t / (self.dim_t - 2))
-
-    def forward(self, hidden_states: torch.Tensor, fps: Optional[int] = None) -> Tuple[torch.Tensor, torch.Tensor]:
-        batch_size, num_channels, num_frames, height, width = hidden_states.shape
-        pe_size = [num_frames // self.patch_size[0], height // self.patch_size[1], width // self.patch_size[2]]
-        device = hidden_states.device
-
-        h_theta = 10000.0 * self.h_ntk_factor
-        w_theta = 10000.0 * self.w_ntk_factor
-        t_theta = 10000.0 * self.t_ntk_factor
-
-        seq = torch.arange(max(self.max_size), device=device, dtype=torch.float32)
-        dim_h_range = (
-            torch.arange(0, self.dim_h, 2, device=device, dtype=torch.float32)[: (self.dim_h // 2)] / self.dim_h
-        )
-        dim_w_range = (
-            torch.arange(0, self.dim_w, 2, device=device, dtype=torch.float32)[: (self.dim_w // 2)] / self.dim_w
-        )
-        dim_t_range = (
-            torch.arange(0, self.dim_t, 2, device=device, dtype=torch.float32)[: (self.dim_t // 2)] / self.dim_t
-        )
-        h_spatial_freqs = 1.0 / (h_theta**dim_h_range)
-        w_spatial_freqs = 1.0 / (w_theta**dim_w_range)
-        temporal_freqs = 1.0 / (t_theta**dim_t_range)
-
-        emb_h = torch.outer(seq[: pe_size[1]], h_spatial_freqs)[None, :, None, :].repeat(pe_size[0], 1, pe_size[2], 1)
-        emb_w = torch.outer(seq[: pe_size[2]], w_spatial_freqs)[None, None, :, :].repeat(pe_size[0], pe_size[1], 1, 1)
-
-        # Apply sequence scaling in temporal dimension
-        if fps is None:
-            # Images
-            emb_t = torch.outer(seq[: pe_size[0]], temporal_freqs)
-        else:
-            # Videos
-            emb_t = torch.outer(seq[: pe_size[0]] / fps * self.base_fps, temporal_freqs)
-
-        emb_t = emb_t[:, None, None, :].repeat(1, pe_size[1], pe_size[2], 1)
-        freqs = torch.cat([emb_t, emb_h, emb_w] * 2, dim=-1).flatten(0, 2).float()
-        cos = torch.cos(freqs)
-        sin = torch.sin(freqs)
-        return cos, sin
-
-
-class CosmosLearnablePositionalEmbed(nn.Module):
-    def __init__(
-        self,
-        hidden_size: int,
-        max_size: Tuple[int, int, int],
-        patch_size: Tuple[int, int, int],
-        eps: float = 1e-6,
-    ) -> None:
-        super().__init__()
-
-        self.max_size = [size // patch for size, patch in zip(max_size, patch_size)]
-        self.patch_size = patch_size
-        self.eps = eps
-
-        self.pos_emb_t = nn.Parameter(torch.zeros(self.max_size[0], hidden_size))
-        self.pos_emb_h = nn.Parameter(torch.zeros(self.max_size[1], hidden_size))
-        self.pos_emb_w = nn.Parameter(torch.zeros(self.max_size[2], hidden_size))
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        batch_size, num_channels, num_frames, height, width = hidden_states.shape
-        pe_size = [num_frames // self.patch_size[0], height // self.patch_size[1], width // self.patch_size[2]]
-
-        emb_t = self.pos_emb_t[: pe_size[0]][None, :, None, None, :].repeat(batch_size, 1, pe_size[1], pe_size[2], 1)
-        emb_h = self.pos_emb_h[: pe_size[1]][None, None, :, None, :].repeat(batch_size, pe_size[0], 1, pe_size[2], 1)
-        emb_w = self.pos_emb_w[: pe_size[2]][None, None, None, :, :].repeat(batch_size, pe_size[0], pe_size[1], 1, 1)
-        emb = emb_t + emb_h + emb_w
-        emb = emb.flatten(1, 3)
-
-        norm = torch.linalg.vector_norm(emb, dim=-1, keepdim=True, dtype=torch.float32)
-        norm = torch.add(self.eps, norm, alpha=np.sqrt(norm.numel() / emb.numel()))
-        return (emb / norm).type_as(hidden_states)
-
-
-class CosmosTransformer3DModel(ModelMixin, ConfigMixin):
-    r"""
-    A Transformer model for video-like data used in [Cosmos](https://github.com/NVIDIA/Cosmos).
-
-    Args:
-        in_channels (`int`, defaults to `16`):
-            The number of channels in the input.
-        out_channels (`int`, defaults to `16`):
-            The number of channels in the output.
-        num_attention_heads (`int`, defaults to `32`):
-            The number of heads to use for multi-head attention.
-        attention_head_dim (`int`, defaults to `128`):
-            The number of channels in each attention head.
-        num_layers (`int`, defaults to `28`):
-            The number of layers of transformer blocks to use.
-        mlp_ratio (`float`, defaults to `4.0`):
-            The ratio of the hidden layer size to the input size in the feedforward network.
-        text_embed_dim (`int`, defaults to `4096`):
-            Input dimension of text embeddings from the text encoder.
-        adaln_lora_dim (`int`, defaults to `256`):
-            The hidden dimension of the Adaptive LayerNorm LoRA layer.
-        max_size (`Tuple[int, int, int]`, defaults to `(128, 240, 240)`):
-            The maximum size of the input latent tensors in the temporal, height, and width dimensions.
-        patch_size (`Tuple[int, int, int]`, defaults to `(1, 2, 2)`):
-            The patch size to use for patchifying the input latent tensors in the temporal, height, and width
-            dimensions.
-        rope_scale (`Tuple[float, float, float]`, defaults to `(2.0, 1.0, 1.0)`):
-            The scaling factor to use for RoPE in the temporal, height, and width dimensions.
-        concat_padding_mask (`bool`, defaults to `True`):
-            Whether to concatenate the padding mask to the input latent tensors.
-        extra_pos_embed_type (`str`, *optional*, defaults to `learnable`):
-            The type of extra positional embeddings to use. Can be one of `None` or `learnable`.
-    """
-
-    _supports_gradient_checkpointing = True
-    _skip_layerwise_casting_patterns = ["patch_embed", "final_layer", "norm"]
-    _no_split_modules = ["CosmosTransformerBlock"]
-    _keep_in_fp32_modules = ["learnable_pos_embed"]
-
-    @register_to_config
-    def __init__(
-        self,
-        in_channels: int = 16,
-        out_channels: int = 16,
-        num_attention_heads: int = 32,
-        attention_head_dim: int = 128,
-        num_layers: int = 28,
-        mlp_ratio: float = 4.0,
-        text_embed_dim: int = 1024,
-        adaln_lora_dim: int = 256,
-        max_size: Tuple[int, int, int] = (128, 240, 240),
-        patch_size: Tuple[int, int, int] = (1, 2, 2),
-        rope_scale: Tuple[float, float, float] = (2.0, 1.0, 1.0),
-        concat_padding_mask: bool = True,
-        extra_pos_embed_type: Optional[str] = "learnable",
-    ) -> None:
-        super().__init__()
-        hidden_size = num_attention_heads * attention_head_dim
-
-        # 1. Patch Embedding
-        patch_embed_in_channels = in_channels + 1 if concat_padding_mask else in_channels
-        self.patch_embed = CosmosPatchEmbed(patch_embed_in_channels, hidden_size, patch_size, bias=False)
-
-        # 2. Positional Embedding
-        self.rope = CosmosRotaryPosEmbed(
-            hidden_size=attention_head_dim, max_size=max_size, patch_size=patch_size, rope_scale=rope_scale
-        )
-
-        self.learnable_pos_embed = None
-        if extra_pos_embed_type == "learnable":
-            self.learnable_pos_embed = CosmosLearnablePositionalEmbed(
-                hidden_size=hidden_size,
-                max_size=max_size,
-                patch_size=patch_size,
-            )
-
-        # 3. Time Embedding
-        self.time_embed = CosmosEmbedding(hidden_size, hidden_size)
-
-        # 4. Transformer Blocks
-        self.transformer_blocks = nn.ModuleList(
-            [
-                CosmosTransformerBlock(
-                    num_attention_heads=num_attention_heads,
-                    attention_head_dim=attention_head_dim,
-                    cross_attention_dim=text_embed_dim,
-                    mlp_ratio=mlp_ratio,
-                    adaln_lora_dim=adaln_lora_dim,
-                    qk_norm="rms_norm",
-                    out_bias=False,
-                )
-                for _ in range(num_layers)
-            ]
-        )
-
-        # 5. Output norm & projection
-        self.norm_out = CosmosAdaLayerNorm(hidden_size, adaln_lora_dim)
-        self.proj_out = nn.Linear(
-            hidden_size, patch_size[0] * patch_size[1] * patch_size[2] * out_channels, bias=False
-        )
-
-        self.gradient_checkpointing = False
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        timestep: torch.Tensor,
-        encoder_hidden_states: torch.Tensor,
-        attention_mask: Optional[torch.Tensor] = None,
-        fps: Optional[int] = None,
-        condition_mask: Optional[torch.Tensor] = None,
-        padding_mask: Optional[torch.Tensor] = None,
-        return_dict: bool = True,
-    ) -> torch.Tensor:
-        batch_size, num_channels, num_frames, height, width = hidden_states.shape
-
-        # 1. Concatenate padding mask if needed & prepare attention mask
-        if condition_mask is not None:
-            hidden_states = torch.cat([hidden_states, condition_mask], dim=1)
-
-        if self.config.concat_padding_mask:
-            padding_mask = transforms.functional.resize(
-                padding_mask, list(hidden_states.shape[-2:]), interpolation=transforms.InterpolationMode.NEAREST
-            )
-            hidden_states = torch.cat(
-                [hidden_states, padding_mask.unsqueeze(2).repeat(batch_size, 1, num_frames, 1, 1)], dim=1
-            )
-
-        if attention_mask is not None:
-            attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, S]
-
-        # 2. Generate positional embeddings
-        image_rotary_emb = self.rope(hidden_states, fps=fps)
-        extra_pos_emb = self.learnable_pos_embed(hidden_states) if self.config.extra_pos_embed_type else None
-
-        # 3. Patchify input
-        p_t, p_h, p_w = self.config.patch_size
-        post_patch_num_frames = num_frames // p_t
-        post_patch_height = height // p_h
-        post_patch_width = width // p_w
-        hidden_states = self.patch_embed(hidden_states)
-        hidden_states = hidden_states.flatten(1, 3)  # [B, T, H, W, C] -> [B, THW, C]
-
-        # 4. Timestep embeddings
-        temb, embedded_timestep = self.time_embed(hidden_states, timestep)
-
-        # 5. Transformer blocks
-        for block in self.transformer_blocks:
-            if torch.is_grad_enabled() and self.gradient_checkpointing:
-                hidden_states = self._gradient_checkpointing_func(
-                    block,
-                    hidden_states,
-                    encoder_hidden_states,
-                    embedded_timestep,
-                    temb,
-                    image_rotary_emb,
-                    extra_pos_emb,
-                    attention_mask,
-                )
-            else:
-                hidden_states = block(
-                    hidden_states=hidden_states,
-                    encoder_hidden_states=encoder_hidden_states,
-                    embedded_timestep=embedded_timestep,
-                    temb=temb,
-                    image_rotary_emb=image_rotary_emb,
-                    extra_pos_emb=extra_pos_emb,
-                    attention_mask=attention_mask,
-                )
-
-        # 6. Output norm & projection & unpatchify
-        hidden_states = self.norm_out(hidden_states, embedded_timestep, temb)
-        hidden_states = self.proj_out(hidden_states)
-        hidden_states = hidden_states.unflatten(2, (p_h, p_w, p_t, -1))
-        hidden_states = hidden_states.unflatten(1, (post_patch_num_frames, post_patch_height, post_patch_width))
-        # Please just kill me at this point. What even is this permutation order and why is it different from the patching order?
-        # Another few hours of sanity lost to the void.
-        hidden_states = hidden_states.permute(0, 7, 1, 6, 2, 4, 3, 5)
-        hidden_states = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
-
-        if not return_dict:
-            return (hidden_states,)
-
-        return Transformer2DModelOutput(sample=hidden_states)
@@ -1,413 +0,0 @@
-# Copyright 2025 The Framepack Team, The Hunyuan Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import Any, Dict, List, Optional, Tuple
-
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-
-from ...configuration_utils import ConfigMixin, register_to_config
-from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
-from ...utils import USE_PEFT_BACKEND, get_logger, scale_lora_layers, unscale_lora_layers
-from ..cache_utils import CacheMixin
-from ..embeddings import get_1d_rotary_pos_embed
-from ..modeling_outputs import Transformer2DModelOutput
-from ..modeling_utils import ModelMixin
-from ..normalization import AdaLayerNormContinuous
-from .transformer_hunyuan_video import (
-    HunyuanVideoConditionEmbedding,
-    HunyuanVideoPatchEmbed,
-    HunyuanVideoSingleTransformerBlock,
-    HunyuanVideoTokenRefiner,
-    HunyuanVideoTransformerBlock,
-)
-
-
-logger = get_logger(__name__)  # pylint: disable=invalid-name
-
-
-class HunyuanVideoFramepackRotaryPosEmbed(nn.Module):
-    def __init__(self, patch_size: int, patch_size_t: int, rope_dim: List[int], theta: float = 256.0) -> None:
-        super().__init__()
-
-        self.patch_size = patch_size
-        self.patch_size_t = patch_size_t
-        self.rope_dim = rope_dim
-        self.theta = theta
-
-    def forward(self, frame_indices: torch.Tensor, height: int, width: int, device: torch.device):
-        height = height // self.patch_size
-        width = width // self.patch_size
-        grid = torch.meshgrid(
-            frame_indices.to(device=device, dtype=torch.float32),
-            torch.arange(0, height, device=device, dtype=torch.float32),
-            torch.arange(0, width, device=device, dtype=torch.float32),
-            indexing="ij",
-        )  # 3 * [W, H, T]
-        grid = torch.stack(grid, dim=0)  # [3, W, H, T]
-
-        freqs = []
-        for i in range(3):
-            freq = get_1d_rotary_pos_embed(self.rope_dim[i], grid[i].reshape(-1), self.theta, use_real=True)
-            freqs.append(freq)
-
-        freqs_cos = torch.cat([f[0] for f in freqs], dim=1)  # (W * H * T, D / 2)
-        freqs_sin = torch.cat([f[1] for f in freqs], dim=1)  # (W * H * T, D / 2)
-
-        return freqs_cos, freqs_sin
-
-
-class FramepackClipVisionProjection(nn.Module):
-    def __init__(self, in_channels: int, out_channels: int):
-        super().__init__()
-        self.up = nn.Linear(in_channels, out_channels * 3)
-        self.down = nn.Linear(out_channels * 3, out_channels)
-
-    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
-        hidden_states = self.up(hidden_states)
-        hidden_states = F.silu(hidden_states)
-        hidden_states = self.down(hidden_states)
-        return hidden_states
-
-
-class HunyuanVideoHistoryPatchEmbed(nn.Module):
-    def __init__(self, in_channels: int, inner_dim: int):
-        super().__init__()
-        self.proj = nn.Conv3d(in_channels, inner_dim, kernel_size=(1, 2, 2), stride=(1, 2, 2))
-        self.proj_2x = nn.Conv3d(in_channels, inner_dim, kernel_size=(2, 4, 4), stride=(2, 4, 4))
-        self.proj_4x = nn.Conv3d(in_channels, inner_dim, kernel_size=(4, 8, 8), stride=(4, 8, 8))
-
-    def forward(
-        self,
-        latents_clean: Optional[torch.Tensor] = None,
-        latents_clean_2x: Optional[torch.Tensor] = None,
-        latents_clean_4x: Optional[torch.Tensor] = None,
-    ):
-        if latents_clean is not None:
-            latents_clean = self.proj(latents_clean)
-            latents_clean = latents_clean.flatten(2).transpose(1, 2)
-        if latents_clean_2x is not None:
-            latents_clean_2x = _pad_for_3d_conv(latents_clean_2x, (2, 4, 4))
-            latents_clean_2x = self.proj_2x(latents_clean_2x)
-            latents_clean_2x = latents_clean_2x.flatten(2).transpose(1, 2)
-        if latents_clean_4x is not None:
-            latents_clean_4x = _pad_for_3d_conv(latents_clean_4x, (4, 8, 8))
-            latents_clean_4x = self.proj_4x(latents_clean_4x)
-            latents_clean_4x = latents_clean_4x.flatten(2).transpose(1, 2)
-        return latents_clean, latents_clean_2x, latents_clean_4x
-
-
-class HunyuanVideoFramepackTransformer3DModel(
-    ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin
-):
-    _supports_gradient_checkpointing = True
-    _skip_layerwise_casting_patterns = ["x_embedder", "context_embedder", "norm"]
-    _no_split_modules = [
-        "HunyuanVideoTransformerBlock",
-        "HunyuanVideoSingleTransformerBlock",
-        "HunyuanVideoHistoryPatchEmbed",
-        "HunyuanVideoTokenRefiner",
-    ]
-
-    @register_to_config
-    def __init__(
-        self,
-        in_channels: int = 16,
-        out_channels: int = 16,
-        num_attention_heads: int = 24,
-        attention_head_dim: int = 128,
-        num_layers: int = 20,
-        num_single_layers: int = 40,
-        num_refiner_layers: int = 2,
-        mlp_ratio: float = 4.0,
-        patch_size: int = 2,
-        patch_size_t: int = 1,
-        qk_norm: str = "rms_norm",
-        guidance_embeds: bool = True,
-        text_embed_dim: int = 4096,
-        pooled_projection_dim: int = 768,
-        rope_theta: float = 256.0,
-        rope_axes_dim: Tuple[int] = (16, 56, 56),
-        image_condition_type: Optional[str] = None,
-        has_image_proj: int = False,
-        image_proj_dim: int = 1152,
-        has_clean_x_embedder: int = False,
-    ) -> None:
-        super().__init__()
-
-        inner_dim = num_attention_heads * attention_head_dim
-        out_channels = out_channels or in_channels
-
-        # 1. Latent and condition embedders
-        self.x_embedder = HunyuanVideoPatchEmbed((patch_size_t, patch_size, patch_size), in_channels, inner_dim)
-        self.context_embedder = HunyuanVideoTokenRefiner(
-            text_embed_dim, num_attention_heads, attention_head_dim, num_layers=num_refiner_layers
-        )
-        self.time_text_embed = HunyuanVideoConditionEmbedding(
-            inner_dim, pooled_projection_dim, guidance_embeds, image_condition_type
-        )
-
-        # 2. RoPE
-        self.rope = HunyuanVideoFramepackRotaryPosEmbed(patch_size, patch_size_t, rope_axes_dim, rope_theta)
-
-        # 3. Dual stream transformer blocks
-        self.transformer_blocks = nn.ModuleList(
-            [
-                HunyuanVideoTransformerBlock(
-                    num_attention_heads, attention_head_dim, mlp_ratio=mlp_ratio, qk_norm=qk_norm
-                )
-                for _ in range(num_layers)
-            ]
-        )
-
-        # 4. Single stream transformer blocks
-        self.single_transformer_blocks = nn.ModuleList(
-            [
-                HunyuanVideoSingleTransformerBlock(
-                    num_attention_heads, attention_head_dim, mlp_ratio=mlp_ratio, qk_norm=qk_norm
-                )
-                for _ in range(num_single_layers)
-            ]
-        )
-
-        # 5. Output projection
-        self.norm_out = AdaLayerNormContinuous(inner_dim, inner_dim, elementwise_affine=False, eps=1e-6)
-        self.proj_out = nn.Linear(inner_dim, patch_size_t * patch_size * patch_size * out_channels)
-
-        # Framepack specific modules
-        self.image_projection = FramepackClipVisionProjection(image_proj_dim, inner_dim) if has_image_proj else None
-
-        self.clean_x_embedder = None
-        if has_clean_x_embedder:
-            self.clean_x_embedder = HunyuanVideoHistoryPatchEmbed(in_channels, inner_dim)
-
-        self.use_gradient_checkpointing = False
-
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        timestep: torch.LongTensor,
-        encoder_hidden_states: torch.Tensor,
-        encoder_attention_mask: torch.Tensor,
-        pooled_projections: torch.Tensor,
-        image_embeds: torch.Tensor,
-        indices_latents: torch.Tensor,
-        guidance: Optional[torch.Tensor] = None,
-        latents_clean: Optional[torch.Tensor] = None,
-        indices_latents_clean: Optional[torch.Tensor] = None,
-        latents_history_2x: Optional[torch.Tensor] = None,
-        indices_latents_history_2x: Optional[torch.Tensor] = None,
-        latents_history_4x: Optional[torch.Tensor] = None,
-        indices_latents_history_4x: Optional[torch.Tensor] = None,
-        attention_kwargs: Optional[Dict[str, Any]] = None,
-        return_dict: bool = True,
-    ):
-        if attention_kwargs is not None:
-            attention_kwargs = attention_kwargs.copy()
-            lora_scale = attention_kwargs.pop("scale", 1.0)
-        else:
-            lora_scale = 1.0
-
-        if USE_PEFT_BACKEND:
-            # weight the lora layers by setting `lora_scale` for each PEFT layer
-            scale_lora_layers(self, lora_scale)
-        else:
-            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
-                logger.warning(
-                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
-                )
-
-        batch_size, num_channels, num_frames, height, width = hidden_states.shape
-        p, p_t = self.config.patch_size, self.config.patch_size_t
-        post_patch_num_frames = num_frames // p_t
-        post_patch_height = height // p
-        post_patch_width = width // p
-        original_context_length = post_patch_num_frames * post_patch_height * post_patch_width
-
-        if indices_latents is None:
-            indices_latents = torch.arange(0, num_frames).unsqueeze(0).expand(batch_size, -1)
-
-        hidden_states = self.x_embedder(hidden_states)
-        image_rotary_emb = self.rope(
-            frame_indices=indices_latents, height=height, width=width, device=hidden_states.device
-        )
-
-        latents_clean, latents_history_2x, latents_history_4x = self.clean_x_embedder(
-            latents_clean, latents_history_2x, latents_history_4x
-        )
-
-        if latents_clean is not None and indices_latents_clean is not None:
-            image_rotary_emb_clean = self.rope(
-                frame_indices=indices_latents_clean, height=height, width=width, device=hidden_states.device
-            )
-        if latents_history_2x is not None and indices_latents_history_2x is not None:
-            image_rotary_emb_history_2x = self.rope(
-                frame_indices=indices_latents_history_2x, height=height, width=width, device=hidden_states.device
-            )
-        if latents_history_4x is not None and indices_latents_history_4x is not None:
-            image_rotary_emb_history_4x = self.rope(
-                frame_indices=indices_latents_history_4x, height=height, width=width, device=hidden_states.device
-            )
-
-        hidden_states, image_rotary_emb = self._pack_history_states(
-            hidden_states,
-            latents_clean,
-            latents_history_2x,
-            latents_history_4x,
-            image_rotary_emb,
-            image_rotary_emb_clean,
-            image_rotary_emb_history_2x,
-            image_rotary_emb_history_4x,
-            post_patch_height,
-            post_patch_width,
-        )
-
-        temb, _ = self.time_text_embed(timestep, pooled_projections, guidance)
-        encoder_hidden_states = self.context_embedder(encoder_hidden_states, timestep, encoder_attention_mask)
-
-        encoder_hidden_states_image = self.image_projection(image_embeds)
-        attention_mask_image = encoder_attention_mask.new_ones((batch_size, encoder_hidden_states_image.shape[1]))
-
-        # must cat before (not after) encoder_hidden_states, due to attn masking
-        encoder_hidden_states = torch.cat([encoder_hidden_states_image, encoder_hidden_states], dim=1)
-        encoder_attention_mask = torch.cat([attention_mask_image, encoder_attention_mask], dim=1)
-
-        latent_sequence_length = hidden_states.shape[1]
-        condition_sequence_length = encoder_hidden_states.shape[1]
-        sequence_length = latent_sequence_length + condition_sequence_length
-        attention_mask = torch.zeros(
-            batch_size, sequence_length, device=hidden_states.device, dtype=torch.bool
-        )  # [B, N]
-        effective_condition_sequence_length = encoder_attention_mask.sum(dim=1, dtype=torch.int)  # [B,]
-        effective_sequence_length = latent_sequence_length + effective_condition_sequence_length
-
-        if batch_size == 1:
-            encoder_hidden_states = encoder_hidden_states[:, : effective_condition_sequence_length[0]]
-            attention_mask = None
-        else:
-            for i in range(batch_size):
-                attention_mask[i, : effective_sequence_length[i]] = True
-            # [B, 1, 1, N], for broadcasting across attention heads
-            attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
-
-        if torch.is_grad_enabled() and self.gradient_checkpointing:
-            for block in self.transformer_blocks:
-                hidden_states, encoder_hidden_states = self._gradient_checkpointing_func(
-                    block, hidden_states, encoder_hidden_states, temb, attention_mask, image_rotary_emb
-                )
-
-            for block in self.single_transformer_blocks:
-                hidden_states, encoder_hidden_states = self._gradient_checkpointing_func(
-                    block, hidden_states, encoder_hidden_states, temb, attention_mask, image_rotary_emb
-                )
-
-        else:
-            for block in self.transformer_blocks:
-                hidden_states, encoder_hidden_states = block(
-                    hidden_states, encoder_hidden_states, temb, attention_mask, image_rotary_emb
-                )
-
-            for block in self.single_transformer_blocks:
-                hidden_states, encoder_hidden_states = block(
-                    hidden_states, encoder_hidden_states, temb, attention_mask, image_rotary_emb
-                )
-
-        hidden_states = hidden_states[:, -original_context_length:]
-        hidden_states = self.norm_out(hidden_states, temb)
-        hidden_states = self.proj_out(hidden_states)
-
-        hidden_states = hidden_states.reshape(
-            batch_size, post_patch_num_frames, post_patch_height, post_patch_width, -1, p_t, p, p
-        )
-        hidden_states = hidden_states.permute(0, 4, 1, 5, 2, 6, 3, 7)
-        hidden_states = hidden_states.flatten(6, 7).flatten(4, 5).flatten(2, 3)
-
-        if USE_PEFT_BACKEND:
-            # remove `lora_scale` from each PEFT layer
-            unscale_lora_layers(self, lora_scale)
-
-        if not return_dict:
-            return (hidden_states,)
-        return Transformer2DModelOutput(sample=hidden_states)
-
-    def _pack_history_states(
-        self,
-        hidden_states: torch.Tensor,
-        latents_clean: Optional[torch.Tensor] = None,
-        latents_history_2x: Optional[torch.Tensor] = None,
-        latents_history_4x: Optional[torch.Tensor] = None,
-        image_rotary_emb: Tuple[torch.Tensor, torch.Tensor] = None,
-        image_rotary_emb_clean: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        image_rotary_emb_history_2x: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        image_rotary_emb_history_4x: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-        height: int = None,
-        width: int = None,
-    ):
-        image_rotary_emb = list(image_rotary_emb)  # convert tuple to list for in-place modification
-
-        if latents_clean is not None and image_rotary_emb_clean is not None:
-            hidden_states = torch.cat([latents_clean, hidden_states], dim=1)
-            image_rotary_emb[0] = torch.cat([image_rotary_emb_clean[0], image_rotary_emb[0]], dim=0)
-            image_rotary_emb[1] = torch.cat([image_rotary_emb_clean[1], image_rotary_emb[1]], dim=0)
-
-        if latents_history_2x is not None and image_rotary_emb_history_2x is not None:
-            hidden_states = torch.cat([latents_history_2x, hidden_states], dim=1)
-            image_rotary_emb_history_2x = self._pad_rotary_emb(image_rotary_emb_history_2x, height, width, (2, 2, 2))
-            image_rotary_emb[0] = torch.cat([image_rotary_emb_history_2x[0], image_rotary_emb[0]], dim=0)
-            image_rotary_emb[1] = torch.cat([image_rotary_emb_history_2x[1], image_rotary_emb[1]], dim=0)
-
-        if latents_history_4x is not None and image_rotary_emb_history_4x is not None:
-            hidden_states = torch.cat([latents_history_4x, hidden_states], dim=1)
-            image_rotary_emb_history_4x = self._pad_rotary_emb(image_rotary_emb_history_4x, height, width, (4, 4, 4))
-            image_rotary_emb[0] = torch.cat([image_rotary_emb_history_4x[0], image_rotary_emb[0]], dim=0)
-            image_rotary_emb[1] = torch.cat([image_rotary_emb_history_4x[1], image_rotary_emb[1]], dim=0)
-
-        return hidden_states, tuple(image_rotary_emb)
-
-    def _pad_rotary_emb(
-        self,
-        image_rotary_emb: Tuple[torch.Tensor],
-        height: int,
-        width: int,
-        kernel_size: Tuple[int, int, int],
-    ):
-        # freqs_cos, freqs_sin have shape [W * H * T, D / 2], where D is attention head dim
-        freqs_cos, freqs_sin = image_rotary_emb
-        freqs_cos = freqs_cos.unsqueeze(0).permute(0, 2, 1).unflatten(2, (-1, height, width))
-        freqs_sin = freqs_sin.unsqueeze(0).permute(0, 2, 1).unflatten(2, (-1, height, width))
-        freqs_cos = _pad_for_3d_conv(freqs_cos, kernel_size)
-        freqs_sin = _pad_for_3d_conv(freqs_sin, kernel_size)
-        freqs_cos = _center_down_sample_3d(freqs_cos, kernel_size)
-        freqs_sin = _center_down_sample_3d(freqs_sin, kernel_size)
-        freqs_cos = freqs_cos.flatten(2).permute(0, 2, 1).squeeze(0)
-        freqs_sin = freqs_sin.flatten(2).permute(0, 2, 1).squeeze(0)
-        return freqs_cos, freqs_sin
-
-
-def _pad_for_3d_conv(x, kernel_size):
-    if isinstance(x, (tuple, list)):
-        return tuple(_pad_for_3d_conv(i, kernel_size) for i in x)
-    b, c, t, h, w = x.shape
-    pt, ph, pw = kernel_size
-    pad_t = (pt - (t % pt)) % pt
-    pad_h = (ph - (h % ph)) % ph
-    pad_w = (pw - (w % pw)) % pw
-    return torch.nn.functional.pad(x, (0, pad_w, 0, pad_h, 0, pad_t), mode="replicate")
-
-
-def _center_down_sample_3d(x, kernel_size):
-    if isinstance(x, (tuple, list)):
-        return tuple(_center_down_sample_3d(i, kernel_size) for i in x)
-    return torch.nn.functional.avg_pool3d(x, kernel_size, stride=kernel_size)
@@ -156,8 +156,6 @@ else:
    ]
    _import_structure["cogview3"] = ["CogView3PlusPipeline"]
    _import_structure["cogview4"] = ["CogView4Pipeline", "CogView4ControlPipeline"]
-    _import_structure["consisid"] = ["ConsisIDPipeline"]
-    _import_structure["cosmos"] = ["CosmosTextToWorldPipeline", "CosmosVideoToWorldPipeline"]
    _import_structure["controlnet"].extend(
        [
            "BlipDiffusionControlNetPipeline",
@@ -229,7 +227,6 @@ else:
        "HunyuanVideoPipeline",
        "HunyuanSkyreelsImageToVideoPipeline",
        "HunyuanVideoImageToVideoPipeline",
-        "HunyuanVideoFramepackPipeline",
    ]
    _import_structure["kandinsky"] = [
        "KandinskyCombinedPipeline",
@@ -548,7 +545,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            StableDiffusionControlNetXSPipeline,
            StableDiffusionXLControlNetXSPipeline,
        )
-        from .cosmos import CosmosTextToWorldPipeline, CosmosVideoToWorldPipeline
        from .deepfloyd_if import (
            IFImg2ImgPipeline,
            IFImg2ImgSuperResolutionPipeline,
@@ -593,7 +589,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .hidream_image import HiDreamImagePipeline
        from .hunyuan_video import (
            HunyuanSkyreelsImageToVideoPipeline,
-            HunyuanVideoFramepackPipeline,
            HunyuanVideoImageToVideoPipeline,
            HunyuanVideoPipeline,
        )
@@ -40,7 +40,6 @@ from ...utils import (
    logging,
    replace_example_docstring,
 )
-from ...utils.import_utils import is_transformers_version
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import AudioPipelineOutput, DiffusionPipeline
 from .modeling_audioldm2 import AudioLDM2ProjectionModel, AudioLDM2UNet2DConditionModel
@@ -313,19 +312,8 @@ class AudioLDM2Pipeline(DiffusionPipeline):
            `inputs_embeds (`torch.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
                The sequence of generated hidden-states.
        """
-        cache_position_kwargs = {}
-        if is_transformers_version("<", "4.52.0.dev0"):
-            cache_position_kwargs["input_ids"] = inputs_embeds
-            cache_position_kwargs["model_kwargs"] = model_kwargs
-        else:
-            cache_position_kwargs["seq_length"] = inputs_embeds.shape[0]
-            cache_position_kwargs["device"] = (
-                self.language_model.device if getattr(self, "language_model", None) is not None else self.device
-            )
-            cache_position_kwargs["model_kwargs"] = model_kwargs
        max_new_tokens = max_new_tokens if max_new_tokens is not None else self.language_model.config.max_new_tokens
-        model_kwargs = self.language_model._get_initial_cache_position(**cache_position_kwargs)
-
+        model_kwargs = self.language_model._get_initial_cache_position(inputs_embeds, model_kwargs)
        for _ in range(max_new_tokens):
            # prepare model inputs
            model_inputs = prepare_inputs_for_generation(inputs_embeds, **model_kwargs)
@@ -1,50 +0,0 @@
-from typing import TYPE_CHECKING
-
-from ...utils import (
-    DIFFUSERS_SLOW_IMPORT,
-    OptionalDependencyNotAvailable,
-    _LazyModule,
-    get_objects_from_module,
-    is_torch_available,
-    is_transformers_available,
-)
-
-
-_dummy_objects = {}
-_import_structure = {}
-
-
-try:
-    if not (is_transformers_available() and is_torch_available()):
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
-
-    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
-else:
-    _import_structure["pipeline_cosmos_text2world"] = ["CosmosTextToWorldPipeline"]
-    _import_structure["pipeline_cosmos_video2world"] = ["CosmosVideoToWorldPipeline"]
-
-if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
-    try:
-        if not (is_transformers_available() and is_torch_available()):
-            raise OptionalDependencyNotAvailable()
-
-    except OptionalDependencyNotAvailable:
-        from ...utils.dummy_torch_and_transformers_objects import *
-    else:
-        from .pipeline_cosmos_text2world import CosmosTextToWorldPipeline
-        from .pipeline_cosmos_video2world import CosmosVideoToWorldPipeline
-
-else:
-    import sys
-
-    sys.modules[__name__] = _LazyModule(
-        __name__,
-        globals()["__file__"],
-        _import_structure,
-        module_spec=__spec__,
-    )
-
-    for name, value in _dummy_objects.items():
-        setattr(sys.modules[__name__], name, value)
@@ -1,667 +0,0 @@
-# Copyright 2024 The NVIDIA Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-from typing import Callable, Dict, List, Optional, Union
-
-import numpy as np
-import torch
-from transformers import T5EncoderModel, T5TokenizerFast
-
-from ...callbacks import MultiPipelineCallbacks, PipelineCallback
-from ...models import AutoencoderKLCosmos, CosmosTransformer3DModel
-from ...schedulers import EDMEulerScheduler
-from ...utils import is_cosmos_guardrail_available, is_torch_xla_available, logging, replace_example_docstring
-from ...utils.torch_utils import randn_tensor
-from ...video_processor import VideoProcessor
-from ..pipeline_utils import DiffusionPipeline
-from .pipeline_output import CosmosPipelineOutput
-
-
-if is_cosmos_guardrail_available():
-    from cosmos_guardrail import CosmosSafetyChecker
-else:
-
-    class CosmosSafetyChecker:
-        def __init__(self, *args, **kwargs):
-            raise ImportError(
-                "`cosmos_guardrail` is not installed. Please install it to use the safety checker for Cosmos: `pip install cosmos_guardrail`."
-            )
-
-
-if is_torch_xla_available():
-    import torch_xla.core.xla_model as xm
-
-    XLA_AVAILABLE = True
-else:
-    XLA_AVAILABLE = False
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-EXAMPLE_DOC_STRING = """
-    Examples:
-        ```python
-        >>> import torch
-        >>> from diffusers import CosmosTextToWorldPipeline
-        >>> from diffusers.utils import export_to_video
-
-        >>> model_id = "nvidia/Cosmos-1.0-Diffusion-7B-Text2World"
-        >>> pipe = CosmosTextToWorldPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-        >>> pipe.to("cuda")
-
-        >>> prompt = "A sleek, humanoid robot stands in a vast warehouse filled with neatly stacked cardboard boxes on industrial shelves. The robot's metallic body gleams under the bright, even lighting, highlighting its futuristic design and intricate joints. A glowing blue light emanates from its chest, adding a touch of advanced technology. The background is dominated by rows of boxes, suggesting a highly organized storage system. The floor is lined with wooden pallets, enhancing the industrial setting. The camera remains static, capturing the robot's poised stance amidst the orderly environment, with a shallow depth of field that keeps the focus on the robot while subtly blurring the background for a cinematic effect."
-
-        >>> output = pipe(prompt=prompt).frames[0]
-        >>> export_to_video(output, "output.mp4", fps=30)
-        ```
-"""
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
-def retrieve_timesteps(
-    scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    sigmas: Optional[List[float]] = None,
-    **kwargs,
-):
-    r"""
-    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
-    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
-    Args:
-        scheduler (`SchedulerMixin`):
-            The scheduler to get timesteps from.
-        num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
-            must be `None`.
-        device (`str` or `torch.device`, *optional*):
-            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        timesteps (`List[int]`, *optional*):
-            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
-            `num_inference_steps` and `sigmas` must be `None`.
-        sigmas (`List[float]`, *optional*):
-            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
-            `num_inference_steps` and `timesteps` must be `None`.
-
-    Returns:
-        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
-        second element is the number of inference steps.
-    """
-    if timesteps is not None and sigmas is not None:
-        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
-    if timesteps is not None:
-        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accepts_timesteps:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" timestep schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    elif sigmas is not None:
-        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accept_sigmas:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" sigmas schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    else:
-        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-    return timesteps, num_inference_steps
-
-
-class CosmosTextToWorldPipeline(DiffusionPipeline):
-    r"""
-    Pipeline for text-to-video generation using [Cosmos](https://github.com/NVIDIA/Cosmos).
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
-    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
-
-    Args:
-        text_encoder ([`T5EncoderModel`]):
-            Frozen text-encoder. Cosmos uses
-            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
-            [t5-11b](https://huggingface.co/google-t5/t5-11b) variant.
-        tokenizer (`T5TokenizerFast`):
-            Tokenizer of class
-            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
-        transformer ([`CosmosTransformer3DModel`]):
-            Conditional Transformer to denoise the encoded image latents.
-        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
-            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
-        vae ([`AutoencoderKLCosmos`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
-    """
-
-    model_cpu_offload_seq = "text_encoder->transformer->vae"
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
-    # We mark safety_checker as optional here to get around some test failures, but it is not really optional
-    _optional_components = ["safety_checker"]
-
-    def __init__(
-        self,
-        text_encoder: T5EncoderModel,
-        tokenizer: T5TokenizerFast,
-        transformer: CosmosTransformer3DModel,
-        vae: AutoencoderKLCosmos,
-        scheduler: EDMEulerScheduler,
-        safety_checker: CosmosSafetyChecker = None,
-    ):
-        super().__init__()
-
-        if safety_checker is None:
-            safety_checker = CosmosSafetyChecker()
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            transformer=transformer,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-        )
-
-        self.vae_scale_factor_temporal = (
-            self.vae.config.temporal_compression_ratio if getattr(self, "vae", None) else 8
-        )
-        self.vae_scale_factor_spatial = self.vae.config.spatial_compression_ratio if getattr(self, "vae", None) else 8
-        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
-
-    def _get_t5_prompt_embeds(
-        self,
-        prompt: Union[str, List[str]] = None,
-        max_sequence_length: int = 512,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
-    ):
-        device = device or self._execution_device
-        dtype = dtype or self.text_encoder.dtype
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=max_sequence_length,
-            truncation=True,
-            return_tensors="pt",
-            return_length=True,
-            return_offsets_mapping=False,
-        )
-        text_input_ids = text_inputs.input_ids
-        prompt_attention_mask = text_inputs.attention_mask.bool().to(device)
-
-        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
-            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
-            logger.warning(
-                "The following part of your input was truncated because `max_sequence_length` is set to "
-                f" {max_sequence_length} tokens: {removed_text}"
-            )
-
-        prompt_embeds = self.text_encoder(
-            text_input_ids.to(device), attention_mask=prompt_attention_mask
-        ).last_hidden_state
-        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
-
-        lengths = prompt_attention_mask.sum(dim=1).cpu()
-        for i, length in enumerate(lengths):
-            prompt_embeds[i, length:] = 0
-
-        return prompt_embeds
-
-    def encode_prompt(
-        self,
-        prompt: Union[str, List[str]],
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        do_classifier_free_guidance: bool = True,
-        num_videos_per_prompt: int = 1,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        max_sequence_length: int = 512,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
-    ):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                prompt to be encoded
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-                less than `1`).
-            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
-                Whether to use classifier free guidance or not.
-            num_videos_per_prompt (`int`, *optional*, defaults to 1):
-                Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
-            prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            device: (`torch.device`, *optional*):
-                torch device
-            dtype: (`torch.dtype`, *optional*):
-                torch dtype
-        """
-        device = device or self._execution_device
-
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-        if prompt is not None:
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            prompt_embeds = self._get_t5_prompt_embeds(
-                prompt=prompt, max_sequence_length=max_sequence_length, device=device, dtype=dtype
-            )
-
-            # duplicate text embeddings for each generation per prompt, using mps friendly method
-            _, seq_len, _ = prompt_embeds.shape
-            prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
-            prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
-
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
-            negative_prompt = negative_prompt or ""
-            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
-
-            if prompt is not None and type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-
-            negative_prompt_embeds = self._get_t5_prompt_embeds(
-                prompt=negative_prompt, max_sequence_length=max_sequence_length, device=device, dtype=dtype
-            )
-
-            # duplicate text embeddings for each generation per prompt, using mps friendly method
-            _, seq_len, _ = negative_prompt_embeds.shape
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
-
-        return prompt_embeds, negative_prompt_embeds
-
-    def prepare_latents(
-        self,
-        batch_size: int,
-        num_channels_latents: 16,
-        height: int = 704,
-        width: int = 1280,
-        num_frames: int = 121,
-        dtype: Optional[torch.dtype] = None,
-        device: Optional[torch.device] = None,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        if latents is not None:
-            return latents.to(device=device, dtype=dtype) * self.scheduler.config.sigma_max
-
-        num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
-        latent_height = height // self.vae_scale_factor_spatial
-        latent_width = width // self.vae_scale_factor_spatial
-        shape = (batch_size, num_channels_latents, num_latent_frames, latent_height, latent_width)
-
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
-        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        return latents * self.scheduler.config.sigma_max
-
-    def check_inputs(
-        self,
-        prompt,
-        height,
-        width,
-        prompt_embeds=None,
-        callback_on_step_end_tensor_inputs=None,
-    ):
-        if height % 16 != 0 or width % 16 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
-
-        if callback_on_step_end_tensor_inputs is not None and not all(
-            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
-        ):
-            raise ValueError(
-                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
-            )
-
-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-    @property
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    @property
-    def do_classifier_free_guidance(self):
-        return self._guidance_scale > 1.0
-
-    @property
-    def num_timesteps(self):
-        return self._num_timesteps
-
-    @property
-    def current_timestep(self):
-        return self._current_timestep
-
-    @property
-    def interrupt(self):
-        return self._interrupt
-
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        prompt: Union[str, List[str]] = None,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        height: int = 704,
-        width: int = 1280,
-        num_frames: int = 121,
-        num_inference_steps: int = 36,
-        guidance_scale: float = 7.0,
-        fps: int = 30,
-        num_videos_per_prompt: Optional[int] = 1,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback_on_step_end: Optional[
-            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
-        ] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        max_sequence_length: int = 512,
-    ):
-        r"""
-        The call function to the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
-            height (`int`, defaults to `720`):
-                The height in pixels of the generated image.
-            width (`int`, defaults to `1280`):
-                The width in pixels of the generated image.
-            num_frames (`int`, defaults to `129`):
-                The number of frames in the generated video.
-            num_inference_steps (`int`, defaults to `50`):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, defaults to `6.0`):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`.
-            fps (`int`, defaults to `30`):
-                The frames per second of the generated video.
-            num_videos_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
-                generation deterministic.
-            latents (`torch.Tensor`, *optional*):
-                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
-                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`CosmosPipelineOutput`] instead of a plain tuple.
-            clip_skip (`int`, *optional*):
-                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
-                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
-                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
-                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
-                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
-                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
-
-        Examples:
-
-        Returns:
-            [`~CosmosPipelineOutput`] or `tuple`:
-                If `return_dict` is `True`, [`CosmosPipelineOutput`] is returned, otherwise a `tuple` is returned where
-                the first element is a list with the generated images and the second element is a list of `bool`s
-                indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content.
-        """
-
-        if self.safety_checker is None:
-            raise ValueError(
-                f"You have disabled the safety checker for {self.__class__}. This is in violation of the "
-                "[NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). "
-                f"Please ensure that you are compliant with the license agreement."
-            )
-
-        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
-            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
-
-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(prompt, height, width, prompt_embeds, callback_on_step_end_tensor_inputs)
-
-        self._guidance_scale = guidance_scale
-        self._current_timestep = None
-        self._interrupt = False
-
-        device = self._execution_device
-
-        if self.safety_checker is not None:
-            self.safety_checker.to(device)
-            if prompt is not None:
-                prompt_list = [prompt] if isinstance(prompt, str) else prompt
-                for p in prompt_list:
-                    if not self.safety_checker.check_text_safety(p):
-                        raise ValueError(
-                            f"Cosmos Guardrail detected unsafe text in the prompt: {p}. Please ensure that the "
-                            f"prompt abides by the NVIDIA Open Model License Agreement."
-                        )
-            self.safety_checker.to("cpu")
-
-        # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        # 3. Encode input prompt
-        (
-            prompt_embeds,
-            negative_prompt_embeds,
-        ) = self.encode_prompt(
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            do_classifier_free_guidance=self.do_classifier_free_guidance,
-            num_videos_per_prompt=num_videos_per_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            device=device,
-            max_sequence_length=max_sequence_length,
-        )
-
-        # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device)
-
-        # 5. Prepare latent variables
-        transformer_dtype = self.transformer.dtype
-        num_channels_latents = self.transformer.config.in_channels
-        latents = self.prepare_latents(
-            batch_size * num_videos_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            num_frames,
-            torch.float32,
-            device,
-            generator,
-            latents,
-        )
-
-        padding_mask = latents.new_zeros(1, 1, height, width, dtype=transformer_dtype)
-
-        # 6. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        self._num_timesteps = len(timesteps)
-
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                if self.interrupt:
-                    continue
-
-                self._current_timestep = t
-                timestep = t.expand(latents.shape[0]).to(transformer_dtype)
-
-                latent_model_input = latents
-                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
-                latent_model_input = latent_model_input.to(transformer_dtype)
-
-                noise_pred = self.transformer(
-                    hidden_states=latent_model_input,
-                    timestep=timestep,
-                    encoder_hidden_states=prompt_embeds,
-                    fps=fps,
-                    padding_mask=padding_mask,
-                    return_dict=False,
-                )[0]
-
-                sample = latents
-                if self.do_classifier_free_guidance:
-                    noise_pred_uncond = self.transformer(
-                        hidden_states=latent_model_input,
-                        timestep=timestep,
-                        encoder_hidden_states=negative_prompt_embeds,
-                        fps=fps,
-                        padding_mask=padding_mask,
-                        return_dict=False,
-                    )[0]
-                    noise_pred = torch.cat([noise_pred_uncond, noise_pred])
-                    sample = torch.cat([sample, sample])
-
-                # pred_original_sample (x0)
-                noise_pred = self.scheduler.step(noise_pred, t, sample, return_dict=False)[1]
-                self.scheduler._step_index -= 1
-
-                if self.do_classifier_free_guidance:
-                    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2)
-                    noise_pred = noise_pred_cond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
-
-                # pred_sample (eps)
-                latents = self.scheduler.step(
-                    noise_pred, t, latents, return_dict=False, pred_original_sample=noise_pred
-                )[0]
-
-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-
-                if XLA_AVAILABLE:
-                    xm.mark_step()
-
-        self._current_timestep = None
-
-        if not output_type == "latent":
-            if self.vae.config.latents_mean is not None:
-                latents_mean, latents_std = self.vae.config.latents_mean, self.vae.config.latents_std
-                latents_mean = (
-                    torch.tensor(latents_mean)
-                    .view(1, self.vae.config.latent_channels, -1, 1, 1)[:, :, : latents.size(2)]
-                    .to(latents)
-                )
-                latents_std = (
-                    torch.tensor(latents_std)
-                    .view(1, self.vae.config.latent_channels, -1, 1, 1)[:, :, : latents.size(2)]
-                    .to(latents)
-                )
-                latents = latents * latents_std / self.scheduler.config.sigma_data + latents_mean
-            else:
-                latents = latents / self.scheduler.config.sigma_data
-            video = self.vae.decode(latents.to(self.vae.dtype), return_dict=False)[0]
-
-            if self.safety_checker is not None:
-                self.safety_checker.to(device)
-                video = self.video_processor.postprocess_video(video, output_type="np")
-                video = (video * 255).astype(np.uint8)
-                video_batch = []
-                for vid in video:
-                    vid = self.safety_checker.check_video_safety(vid)
-                    video_batch.append(vid)
-                video = np.stack(video_batch).astype(np.float32) / 255.0 * 2 - 1
-                video = torch.from_numpy(video).permute(0, 4, 1, 2, 3)
-                video = self.video_processor.postprocess_video(video, output_type=output_type)
-                self.safety_checker.to("cpu")
-            else:
-                video = self.video_processor.postprocess_video(video, output_type=output_type)
-        else:
-            video = latents
-
-        # Offload all models
-        self.maybe_free_model_hooks()
-
-        if not return_dict:
-            return (video,)
-
-        return CosmosPipelineOutput(frames=video)
@@ -1,828 +0,0 @@
-# Copyright 2024 The NVIDIA Team and The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-from typing import Callable, Dict, List, Optional, Union
-
-import numpy as np
-import torch
-from transformers import T5EncoderModel, T5TokenizerFast
-
-from ...callbacks import MultiPipelineCallbacks, PipelineCallback
-from ...image_processor import PipelineImageInput
-from ...models import AutoencoderKLCosmos, CosmosTransformer3DModel
-from ...schedulers import EDMEulerScheduler
-from ...utils import is_cosmos_guardrail_available, is_torch_xla_available, logging, replace_example_docstring
-from ...utils.torch_utils import randn_tensor
-from ...video_processor import VideoProcessor
-from ..pipeline_utils import DiffusionPipeline
-from .pipeline_output import CosmosPipelineOutput
-
-
-if is_cosmos_guardrail_available():
-    from cosmos_guardrail import CosmosSafetyChecker
-else:
-
-    class CosmosSafetyChecker:
-        def __init__(self, *args, **kwargs):
-            raise ImportError(
-                "`cosmos_guardrail` is not installed. Please install it to use the safety checker for Cosmos: `pip install cosmos_guardrail`."
-            )
-
-
-if is_torch_xla_available():
-    import torch_xla.core.xla_model as xm
-
-    XLA_AVAILABLE = True
-else:
-    XLA_AVAILABLE = False
-
-logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
-
-
-EXAMPLE_DOC_STRING = """
-    Examples:
-        Image conditioning:
-
-        ```python
-        >>> import torch
-        >>> from diffusers import CosmosVideoToWorldPipeline
-        >>> from diffusers.utils import export_to_video, load_image
-
-        >>> model_id = "nvidia/Cosmos-1.0-Diffusion-7B-Video2World"
-        >>> pipe = CosmosVideoToWorldPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-        >>> pipe.to("cuda")
-
-        >>> prompt = "The video depicts a long, straight highway stretching into the distance, flanked by metal guardrails. The road is divided into multiple lanes, with a few vehicles visible in the far distance. The surrounding landscape features dry, grassy fields on one side and rolling hills on the other. The sky is mostly clear with a few scattered clouds, suggesting a bright, sunny day."
-        >>> image = load_image(
-        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input.jpg"
-        ... )
-
-        >>> video = pipe(image=image, prompt=prompt).frames[0]
-        >>> export_to_video(video, "output.mp4", fps=30)
-        ```
-
-        Video conditioning:
-
-        ```python
-        >>> import torch
-        >>> from diffusers import CosmosVideoToWorldPipeline
-        >>> from diffusers.utils import export_to_video, load_video
-
-        >>> model_id = "nvidia/Cosmos-1.0-Diffusion-7B-Video2World"
-        >>> pipe = CosmosVideoToWorldPipeline.from_pretrained(model_id, torch_dtype=torch.bfloat16)
-        >>> pipe.transformer = torch.compile(pipe.transformer)
-        >>> pipe.to("cuda")
-
-        >>> prompt = "The video depicts a winding mountain road covered in snow, with a single vehicle traveling along it. The road is flanked by steep, rocky cliffs and sparse vegetation. The landscape is characterized by rugged terrain and a river visible in the distance. The scene captures the solitude and beauty of a winter drive through a mountainous region."
-        >>> video = load_video(
-        ...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cosmos/cosmos-video2world-input-vid.mp4"
-        ... )[
-        ...     :21
-        ... ]  # This example uses only the first 21 frames
-
-        >>> video = pipe(video=video, prompt=prompt).frames[0]
-        >>> export_to_video(video, "output.mp4", fps=30)
-        ```
-"""
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
-def retrieve_timesteps(
-    scheduler,
-    num_inference_steps: Optional[int] = None,
-    device: Optional[Union[str, torch.device]] = None,
-    timesteps: Optional[List[int]] = None,
-    sigmas: Optional[List[float]] = None,
-    **kwargs,
-):
-    r"""
-    Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
-    custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
-
-    Args:
-        scheduler (`SchedulerMixin`):
-            The scheduler to get timesteps from.
-        num_inference_steps (`int`):
-            The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
-            must be `None`.
-        device (`str` or `torch.device`, *optional*):
-            The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
-        timesteps (`List[int]`, *optional*):
-            Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
-            `num_inference_steps` and `sigmas` must be `None`.
-        sigmas (`List[float]`, *optional*):
-            Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
-            `num_inference_steps` and `timesteps` must be `None`.
-
-    Returns:
-        `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
-        second element is the number of inference steps.
-    """
-    if timesteps is not None and sigmas is not None:
-        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
-    if timesteps is not None:
-        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accepts_timesteps:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" timestep schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    elif sigmas is not None:
-        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
-        if not accept_sigmas:
-            raise ValueError(
-                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
-                f" sigmas schedules. Please check whether you are using the correct scheduler."
-            )
-        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-        num_inference_steps = len(timesteps)
-    else:
-        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
-        timesteps = scheduler.timesteps
-    return timesteps, num_inference_steps
-
-
-# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
-def retrieve_latents(
-    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
-):
-    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
-        return encoder_output.latent_dist.sample(generator)
-    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
-        return encoder_output.latent_dist.mode()
-    elif hasattr(encoder_output, "latents"):
-        return encoder_output.latents
-    else:
-        raise AttributeError("Could not access latents of provided encoder_output")
-
-
-class CosmosVideoToWorldPipeline(DiffusionPipeline):
-    r"""
-    Pipeline for image-to-video and video-to-video generation using [Cosmos](https://github.com/NVIDIA/Cosmos).
-
-    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods
-    implemented for all pipelines (downloading, saving, running on a particular device, etc.).
-
-    Args:
-        text_encoder ([`T5EncoderModel`]):
-            Frozen text-encoder. Cosmos uses
-            [T5](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5EncoderModel); specifically the
-            [t5-11b](https://huggingface.co/google-t5/t5-11b) variant.
-        tokenizer (`T5TokenizerFast`):
-            Tokenizer of class
-            [T5Tokenizer](https://huggingface.co/docs/transformers/model_doc/t5#transformers.T5Tokenizer).
-        transformer ([`CosmosTransformer3DModel`]):
-            Conditional Transformer to denoise the encoded image latents.
-        scheduler ([`FlowMatchEulerDiscreteScheduler`]):
-            A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
-        vae ([`AutoencoderKLCosmos`]):
-            Variational Auto-Encoder (VAE) Model to encode and decode videos to and from latent representations.
-    """
-
-    model_cpu_offload_seq = "text_encoder->transformer->vae"
-    _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"]
-    # We mark safety_checker as optional here to get around some test failures, but it is not really optional
-    _optional_components = ["safety_checker"]
-
-    def __init__(
-        self,
-        text_encoder: T5EncoderModel,
-        tokenizer: T5TokenizerFast,
-        transformer: CosmosTransformer3DModel,
-        vae: AutoencoderKLCosmos,
-        scheduler: EDMEulerScheduler,
-        safety_checker: CosmosSafetyChecker = None,
-    ):
-        super().__init__()
-
-        if safety_checker is None:
-            safety_checker = CosmosSafetyChecker()
-
-        self.register_modules(
-            vae=vae,
-            text_encoder=text_encoder,
-            tokenizer=tokenizer,
-            transformer=transformer,
-            scheduler=scheduler,
-            safety_checker=safety_checker,
-        )
-
-        self.vae_scale_factor_temporal = (
-            self.vae.config.temporal_compression_ratio if getattr(self, "vae", None) else 8
-        )
-        self.vae_scale_factor_spatial = self.vae.config.spatial_compression_ratio if getattr(self, "vae", None) else 8
-        self.video_processor = VideoProcessor(vae_scale_factor=self.vae_scale_factor_spatial)
-
-    # Copied from diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline._get_t5_prompt_embeds
-    def _get_t5_prompt_embeds(
-        self,
-        prompt: Union[str, List[str]] = None,
-        max_sequence_length: int = 512,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
-    ):
-        device = device or self._execution_device
-        dtype = dtype or self.text_encoder.dtype
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-
-        text_inputs = self.tokenizer(
-            prompt,
-            padding="max_length",
-            max_length=max_sequence_length,
-            truncation=True,
-            return_tensors="pt",
-            return_length=True,
-            return_offsets_mapping=False,
-        )
-        text_input_ids = text_inputs.input_ids
-        prompt_attention_mask = text_inputs.attention_mask.bool().to(device)
-
-        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
-        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
-            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
-            logger.warning(
-                "The following part of your input was truncated because `max_sequence_length` is set to "
-                f" {max_sequence_length} tokens: {removed_text}"
-            )
-
-        prompt_embeds = self.text_encoder(
-            text_input_ids.to(device), attention_mask=prompt_attention_mask
-        ).last_hidden_state
-        prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
-
-        lengths = prompt_attention_mask.sum(dim=1).cpu()
-        for i, length in enumerate(lengths):
-            prompt_embeds[i, length:] = 0
-
-        return prompt_embeds
-
-    # Copied from diffusers.pipelines.cosmos.pipeline_cosmos_text2world.CosmosTextToWorldPipeline.encode_prompt
-    def encode_prompt(
-        self,
-        prompt: Union[str, List[str]],
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        do_classifier_free_guidance: bool = True,
-        num_videos_per_prompt: int = 1,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        max_sequence_length: int = 512,
-        device: Optional[torch.device] = None,
-        dtype: Optional[torch.dtype] = None,
-    ):
-        r"""
-        Encodes the prompt into text encoder hidden states.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                prompt to be encoded
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-                less than `1`).
-            do_classifier_free_guidance (`bool`, *optional*, defaults to `True`):
-                Whether to use classifier free guidance or not.
-            num_videos_per_prompt (`int`, *optional*, defaults to 1):
-                Number of videos that should be generated per prompt. torch device to place the resulting embeddings on
-            prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            device: (`torch.device`, *optional*):
-                torch device
-            dtype: (`torch.dtype`, *optional*):
-                torch dtype
-        """
-        device = device or self._execution_device
-
-        prompt = [prompt] if isinstance(prompt, str) else prompt
-        if prompt is not None:
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        if prompt_embeds is None:
-            prompt_embeds = self._get_t5_prompt_embeds(
-                prompt=prompt, max_sequence_length=max_sequence_length, device=device, dtype=dtype
-            )
-
-            # duplicate text embeddings for each generation per prompt, using mps friendly method
-            _, seq_len, _ = prompt_embeds.shape
-            prompt_embeds = prompt_embeds.repeat(1, num_videos_per_prompt, 1)
-            prompt_embeds = prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
-
-        if do_classifier_free_guidance and negative_prompt_embeds is None:
-            negative_prompt = negative_prompt or ""
-            negative_prompt = batch_size * [negative_prompt] if isinstance(negative_prompt, str) else negative_prompt
-
-            if prompt is not None and type(prompt) is not type(negative_prompt):
-                raise TypeError(
-                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
-                    f" {type(prompt)}."
-                )
-            elif batch_size != len(negative_prompt):
-                raise ValueError(
-                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
-                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
-                    " the batch size of `prompt`."
-                )
-
-            negative_prompt_embeds = self._get_t5_prompt_embeds(
-                prompt=negative_prompt, max_sequence_length=max_sequence_length, device=device, dtype=dtype
-            )
-
-            # duplicate text embeddings for each generation per prompt, using mps friendly method
-            _, seq_len, _ = negative_prompt_embeds.shape
-            negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_videos_per_prompt, 1)
-            negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_videos_per_prompt, seq_len, -1)
-
-        return prompt_embeds, negative_prompt_embeds
-
-    def prepare_latents(
-        self,
-        video: torch.Tensor,
-        batch_size: int,
-        num_channels_latents: 16,
-        height: int = 704,
-        width: int = 1280,
-        num_frames: int = 121,
-        do_classifier_free_guidance: bool = True,
-        input_frames_guidance: bool = False,
-        dtype: Optional[torch.dtype] = None,
-        device: Optional[torch.device] = None,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        if isinstance(generator, list) and len(generator) != batch_size:
-            raise ValueError(
-                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
-                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
-            )
-
-        num_cond_frames = video.size(2)
-        if num_cond_frames >= num_frames:
-            # Take the last `num_frames` frames for conditioning
-            num_cond_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
-            video = video[:, :, -num_frames:]
-        else:
-            num_cond_latent_frames = (num_cond_frames - 1) // self.vae_scale_factor_temporal + 1
-            num_padding_frames = num_frames - num_cond_frames
-            padding = video.new_zeros(video.size(0), video.size(1), num_padding_frames, video.size(3), video.size(4))
-            video = torch.cat([video, padding], dim=2)
-
-        if isinstance(generator, list):
-            init_latents = [
-                retrieve_latents(self.vae.encode(video[i].unsqueeze(0)), generator=generator[i])
-                for i in range(batch_size)
-            ]
-        else:
-            init_latents = [retrieve_latents(self.vae.encode(vid.unsqueeze(0)), generator) for vid in video]
-
-        init_latents = torch.cat(init_latents, dim=0).to(dtype)
-
-        if self.vae.config.latents_mean is not None:
-            latents_mean, latents_std = self.vae.config.latents_mean, self.vae.config.latents_std
-            latents_mean = (
-                torch.tensor(latents_mean)
-                .view(1, self.vae.config.latent_channels, -1, 1, 1)[:, :, : init_latents.size(2)]
-                .to(init_latents)
-            )
-            latents_std = (
-                torch.tensor(latents_std)
-                .view(1, self.vae.config.latent_channels, -1, 1, 1)[:, :, : init_latents.size(2)]
-                .to(init_latents)
-            )
-            init_latents = (init_latents - latents_mean) * self.scheduler.config.sigma_data / latents_std
-        else:
-            init_latents = init_latents * self.scheduler.config.sigma_data
-
-        num_latent_frames = (num_frames - 1) // self.vae_scale_factor_temporal + 1
-        latent_height = height // self.vae_scale_factor_spatial
-        latent_width = width // self.vae_scale_factor_spatial
-        shape = (batch_size, num_channels_latents, num_latent_frames, latent_height, latent_width)
-
-        if latents is None:
-            latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
-        else:
-            latents = latents.to(device=device, dtype=dtype)
-
-        latents = latents * self.scheduler.config.sigma_max
-
-        padding_shape = (batch_size, 1, num_latent_frames, latent_height, latent_width)
-        ones_padding = latents.new_ones(padding_shape)
-        zeros_padding = latents.new_zeros(padding_shape)
-
-        cond_indicator = latents.new_zeros(1, 1, latents.size(2), 1, 1)
-        cond_indicator[:, :, :num_cond_latent_frames] = 1.0
-        cond_mask = cond_indicator * ones_padding + (1 - cond_indicator) * zeros_padding
-
-        uncond_indicator = uncond_mask = None
-        if do_classifier_free_guidance:
-            uncond_indicator = latents.new_zeros(1, 1, latents.size(2), 1, 1)
-            uncond_indicator[:, :, :num_cond_latent_frames] = 1.0
-            uncond_mask = zeros_padding
-            if not input_frames_guidance:
-                uncond_mask = uncond_indicator * ones_padding + (1 - uncond_indicator) * zeros_padding
-
-        return latents, init_latents, cond_indicator, uncond_indicator, cond_mask, uncond_mask
-
-    def check_inputs(
-        self,
-        prompt,
-        height,
-        width,
-        prompt_embeds=None,
-        callback_on_step_end_tensor_inputs=None,
-        image=None,
-        video=None,
-    ):
-        if height % 16 != 0 or width % 16 != 0:
-            raise ValueError(f"`height` and `width` have to be divisible by 16 but are {height} and {width}.")
-
-        if callback_on_step_end_tensor_inputs is not None and not all(
-            k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
-        ):
-            raise ValueError(
-                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
-            )
-
-        if prompt is not None and prompt_embeds is not None:
-            raise ValueError(
-                f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
-                " only forward one of the two."
-            )
-        elif prompt is None and prompt_embeds is None:
-            raise ValueError(
-                "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
-            )
-        elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
-            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
-
-        if image is None and video is None:
-            raise ValueError("Either `image` or `video` has to be provided.")
-        if image is not None and video is not None:
-            raise ValueError("Only one of `image` or `video` has to be provided.")
-
-    @property
-    def guidance_scale(self):
-        return self._guidance_scale
-
-    @property
-    def do_classifier_free_guidance(self):
-        return self._guidance_scale > 1.0
-
-    @property
-    def num_timesteps(self):
-        return self._num_timesteps
-
-    @property
-    def current_timestep(self):
-        return self._current_timestep
-
-    @property
-    def interrupt(self):
-        return self._interrupt
-
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
-    def __call__(
-        self,
-        image: PipelineImageInput = None,
-        video: List[PipelineImageInput] = None,
-        prompt: Union[str, List[str]] = None,
-        negative_prompt: Optional[Union[str, List[str]]] = None,
-        height: int = 704,
-        width: int = 1280,
-        num_frames: int = 121,
-        num_inference_steps: int = 36,
-        guidance_scale: float = 7.0,
-        input_frames_guidance: bool = False,
-        augment_sigma: float = 0.001,
-        fps: int = 30,
-        num_videos_per_prompt: Optional[int] = 1,
-        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
-        latents: Optional[torch.Tensor] = None,
-        prompt_embeds: Optional[torch.Tensor] = None,
-        negative_prompt_embeds: Optional[torch.Tensor] = None,
-        output_type: Optional[str] = "pil",
-        return_dict: bool = True,
-        callback_on_step_end: Optional[
-            Union[Callable[[int, int, Dict], None], PipelineCallback, MultiPipelineCallbacks]
-        ] = None,
-        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
-        max_sequence_length: int = 512,
-    ):
-        r"""
-        The call function to the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
-            height (`int`, defaults to `720`):
-                The height in pixels of the generated image.
-            width (`int`, defaults to `1280`):
-                The width in pixels of the generated image.
-            num_frames (`int`, defaults to `129`):
-                The number of frames in the generated video.
-            num_inference_steps (`int`, defaults to `50`):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            guidance_scale (`float`, defaults to `6.0`):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`.
-            fps (`int`, defaults to `30`):
-                The frames per second of the generated video.
-            num_videos_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
-                generation deterministic.
-            latents (`torch.Tensor`, *optional*):
-                Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor is generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.Tensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. For PixArt-Sigma this negative prompt should be "". If not
-                provided, negative_prompt_embeds will be generated from `negative_prompt` input argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`CosmosPipelineOutput`] instead of a plain tuple.
-            clip_skip (`int`, *optional*):
-                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
-                the output of the pre-final layer will be used for computing the prompt embeddings.
-            callback_on_step_end (`Callable`, `PipelineCallback`, `MultiPipelineCallbacks`, *optional*):
-                A function or a subclass of `PipelineCallback` or `MultiPipelineCallbacks` that is called at the end of
-                each denoising step during the inference. with the following arguments: `callback_on_step_end(self:
-                DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict)`. `callback_kwargs` will include a
-                list of all tensors as specified by `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
-
-        Examples:
-
-        Returns:
-            [`~CosmosPipelineOutput`] or `tuple`:
-                If `return_dict` is `True`, [`CosmosPipelineOutput`] is returned, otherwise a `tuple` is returned where
-                the first element is a list with the generated images and the second element is a list of `bool`s
-                indicating whether the corresponding generated image contains "not-safe-for-work" (nsfw) content.
-        """
-
-        if self.safety_checker is None:
-            raise ValueError(
-                f"You have disabled the safety checker for {self.__class__}. This is in violation of the "
-                "[NVIDIA Open Model License Agreement](https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-open-model-license). "
-                f"Please ensure that you are compliant with the license agreement."
-            )
-
-        if isinstance(callback_on_step_end, (PipelineCallback, MultiPipelineCallbacks)):
-            callback_on_step_end_tensor_inputs = callback_on_step_end.tensor_inputs
-
-        # 1. Check inputs. Raise error if not correct
-        self.check_inputs(prompt, height, width, prompt_embeds, callback_on_step_end_tensor_inputs, image, video)
-
-        self._guidance_scale = guidance_scale
-        self._current_timestep = None
-        self._interrupt = False
-
-        device = self._execution_device
-
-        if self.safety_checker is not None:
-            self.safety_checker.to(device)
-            if prompt is not None:
-                prompt_list = [prompt] if isinstance(prompt, str) else prompt
-                for p in prompt_list:
-                    if not self.safety_checker.check_text_safety(p):
-                        raise ValueError(
-                            f"Cosmos Guardrail detected unsafe text in the prompt: {p}. Please ensure that the "
-                            f"prompt abides by the NVIDIA Open Model License Agreement."
-                        )
-            self.safety_checker.to("cpu")
-
-        # 2. Define call parameters
-        if prompt is not None and isinstance(prompt, str):
-            batch_size = 1
-        elif prompt is not None and isinstance(prompt, list):
-            batch_size = len(prompt)
-        else:
-            batch_size = prompt_embeds.shape[0]
-
-        # 3. Encode input prompt
-        (
-            prompt_embeds,
-            negative_prompt_embeds,
-        ) = self.encode_prompt(
-            prompt=prompt,
-            negative_prompt=negative_prompt,
-            do_classifier_free_guidance=self.do_classifier_free_guidance,
-            num_videos_per_prompt=num_videos_per_prompt,
-            prompt_embeds=prompt_embeds,
-            negative_prompt_embeds=negative_prompt_embeds,
-            device=device,
-            max_sequence_length=max_sequence_length,
-        )
-
-        # 4. Prepare timesteps
-        timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device)
-
-        # 5. Prepare latent variables
-        vae_dtype = self.vae.dtype
-        transformer_dtype = self.transformer.dtype
-
-        if image is not None:
-            video = self.video_processor.preprocess(image, height, width).unsqueeze(2)
-        else:
-            video = self.video_processor.preprocess_video(video, height, width)
-        video = video.to(device=device, dtype=vae_dtype)
-
-        num_channels_latents = self.transformer.config.in_channels - 1
-        latents, conditioning_latents, cond_indicator, uncond_indicator, cond_mask, uncond_mask = self.prepare_latents(
-            video,
-            batch_size * num_videos_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            num_frames,
-            self.do_classifier_free_guidance,
-            input_frames_guidance,
-            torch.float32,
-            device,
-            generator,
-            latents,
-        )
-        cond_mask = cond_mask.to(transformer_dtype)
-        if self.do_classifier_free_guidance:
-            uncond_mask = uncond_mask.to(transformer_dtype)
-
-        augment_sigma = torch.tensor([augment_sigma], device=device, dtype=torch.float32)
-        padding_mask = latents.new_zeros(1, 1, height, width, dtype=transformer_dtype)
-
-        # 6. Denoising loop
-        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
-        self._num_timesteps = len(timesteps)
-
-        with self.progress_bar(total=num_inference_steps) as progress_bar:
-            for i, t in enumerate(timesteps):
-                if self.interrupt:
-                    continue
-
-                self._current_timestep = t
-                timestep = t.expand(latents.shape[0]).to(transformer_dtype)
-
-                current_sigma = self.scheduler.sigmas[i]
-                is_augment_sigma_greater = augment_sigma >= current_sigma
-
-                c_in_augment = self.scheduler._get_conditioning_c_in(augment_sigma)
-                c_in_original = self.scheduler._get_conditioning_c_in(current_sigma)
-
-                current_cond_indicator = cond_indicator * 0 if is_augment_sigma_greater else cond_indicator
-                cond_noise = randn_tensor(latents.shape, generator=generator, device=device, dtype=torch.float32)
-                cond_latent = conditioning_latents + cond_noise * augment_sigma[:, None, None, None, None]
-                cond_latent = cond_latent * c_in_augment / c_in_original
-                cond_latent = current_cond_indicator * cond_latent + (1 - current_cond_indicator) * latents
-                cond_latent = self.scheduler.scale_model_input(cond_latent, t)
-                cond_latent = cond_latent.to(transformer_dtype)
-
-                noise_pred = self.transformer(
-                    hidden_states=cond_latent,
-                    timestep=timestep,
-                    encoder_hidden_states=prompt_embeds,
-                    fps=fps,
-                    condition_mask=cond_mask,
-                    padding_mask=padding_mask,
-                    return_dict=False,
-                )[0]
-
-                sample = latents
-                if self.do_classifier_free_guidance:
-                    current_uncond_indicator = uncond_indicator * 0 if is_augment_sigma_greater else uncond_indicator
-                    uncond_noise = randn_tensor(latents.shape, generator=generator, device=device, dtype=torch.float32)
-                    uncond_latent = conditioning_latents + uncond_noise * augment_sigma[:, None, None, None, None]
-                    uncond_latent = uncond_latent * c_in_augment / c_in_original
-                    uncond_latent = current_uncond_indicator * uncond_latent + (1 - current_uncond_indicator) * latents
-                    uncond_latent = self.scheduler.scale_model_input(uncond_latent, t)
-                    uncond_latent = uncond_latent.to(transformer_dtype)
-
-                    noise_pred_uncond = self.transformer(
-                        hidden_states=uncond_latent,
-                        timestep=timestep,
-                        encoder_hidden_states=negative_prompt_embeds,
-                        fps=fps,
-                        condition_mask=uncond_mask,
-                        padding_mask=padding_mask,
-                        return_dict=False,
-                    )[0]
-                    noise_pred = torch.cat([noise_pred_uncond, noise_pred])
-                    sample = torch.cat([sample, sample])
-
-                # pred_original_sample (x0)
-                noise_pred = self.scheduler.step(noise_pred, t, sample, return_dict=False)[1]
-                self.scheduler._step_index -= 1
-
-                if self.do_classifier_free_guidance:
-                    noise_pred_uncond, noise_pred_cond = noise_pred.chunk(2, dim=0)
-                    noise_pred_uncond = (
-                        current_uncond_indicator * conditioning_latents
-                        + (1 - current_uncond_indicator) * noise_pred_uncond
-                    )
-                    noise_pred_cond = (
-                        current_cond_indicator * conditioning_latents + (1 - current_cond_indicator) * noise_pred_cond
-                    )
-                    noise_pred = noise_pred_cond + self.guidance_scale * (noise_pred_cond - noise_pred_uncond)
-                else:
-                    noise_pred = (
-                        current_cond_indicator * conditioning_latents + (1 - current_cond_indicator) * noise_pred
-                    )
-
-                # pred_sample (eps)
-                latents = self.scheduler.step(
-                    noise_pred, t, latents, return_dict=False, pred_original_sample=noise_pred
-                )[0]
-
-                if callback_on_step_end is not None:
-                    callback_kwargs = {}
-                    for k in callback_on_step_end_tensor_inputs:
-                        callback_kwargs[k] = locals()[k]
-                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
-
-                    latents = callback_outputs.pop("latents", latents)
-                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
-                    negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds)
-
-                # call the callback, if provided
-                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
-                    progress_bar.update()
-
-                if XLA_AVAILABLE:
-                    xm.mark_step()
-
-        self._current_timestep = None
-
-        if not output_type == "latent":
-            if self.vae.config.latents_mean is not None:
-                latents_mean, latents_std = self.vae.config.latents_mean, self.vae.config.latents_std
-                latents_mean = (
-                    torch.tensor(latents_mean)
-                    .view(1, self.vae.config.latent_channels, -1, 1, 1)[:, :, : latents.size(2)]
-                    .to(latents)
-                )
-                latents_std = (
-                    torch.tensor(latents_std)
-                    .view(1, self.vae.config.latent_channels, -1, 1, 1)[:, :, : latents.size(2)]
-                    .to(latents)
-                )
-                latents = latents * latents_std / self.scheduler.config.sigma_data + latents_mean
-            else:
-                latents = latents / self.scheduler.config.sigma_data
-            video = self.vae.decode(latents.to(vae_dtype), return_dict=False)[0]
-
-            if self.safety_checker is not None:
-                self.safety_checker.to(device)
-                video = self.video_processor.postprocess_video(video, output_type="np")
-                video = (video * 255).astype(np.uint8)
-                video_batch = []
-                for vid in video:
-                    vid = self.safety_checker.check_video_safety(vid)
-                    video_batch.append(vid)
-                video = np.stack(video_batch).astype(np.float32) / 255.0 * 2 - 1
-                video = torch.from_numpy(video).permute(0, 4, 1, 2, 3)
-                video = self.video_processor.postprocess_video(video, output_type=output_type)
-                self.safety_checker.to("cpu")
-            else:
-                video = self.video_processor.postprocess_video(video, output_type=output_type)
-        else:
-            video = latents
-
-        # Offload all models
-        self.maybe_free_model_hooks()
-
-        if not return_dict:
-            return (video,)
-
-        return CosmosPipelineOutput(frames=video)
@@ -1,20 +0,0 @@
-from dataclasses import dataclass
-
-import torch
-
-from diffusers.utils import BaseOutput
-
-
-@dataclass
-class CosmosPipelineOutput(BaseOutput):
-    r"""
-    Output class for Cosmos pipelines.
-
-    Args:
-        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
-            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
-            `(batch_size, num_frames, channels, height, width)`.
-    """
-
-    frames: torch.Tensor
@@ -24,7 +24,6 @@ except OptionalDependencyNotAvailable:
 else:
    _import_structure["pipeline_hunyuan_skyreels_image2video"] = ["HunyuanSkyreelsImageToVideoPipeline"]
    _import_structure["pipeline_hunyuan_video"] = ["HunyuanVideoPipeline"]
-    _import_structure["pipeline_hunyuan_video_framepack"] = ["HunyuanVideoFramepackPipeline"]
    _import_structure["pipeline_hunyuan_video_image2video"] = ["HunyuanVideoImageToVideoPipeline"]

 if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -37,7 +36,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    else:
        from .pipeline_hunyuan_skyreels_image2video import HunyuanSkyreelsImageToVideoPipeline
        from .pipeline_hunyuan_video import HunyuanVideoPipeline
-        from .pipeline_hunyuan_video_framepack import HunyuanVideoFramepackPipeline
        from .pipeline_hunyuan_video_image2video import HunyuanVideoImageToVideoPipeline

 else:
@@ -1,8 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Union

-import numpy as np
-import PIL.Image
 import torch

 from diffusers.utils import BaseOutput
@@ -21,19 +18,3 @@ class HunyuanVideoPipelineOutput(BaseOutput):
    """

    frames: torch.Tensor
-
-
-@dataclass
-class HunyuanVideoFramepackPipelineOutput(BaseOutput):
-    r"""
-    Output class for HunyuanVideo pipelines.
-
-    Args:
-        frames (`torch.Tensor`, `np.ndarray`, or List[List[PIL.Image.Image]]):
-            List of video outputs - It can be a nested list of length `batch_size,` with each sub-list containing
-            denoised PIL image sequences of length `num_frames.` It can also be a NumPy array or Torch tensor of shape
-            `(batch_size, num_frames, channels, height, width)`. Or, a list of torch tensors where each tensor
-            corresponds to a latent that decodes to multiple frames.
-    """
-
-    frames: Union[torch.Tensor, np.ndarray, List[List[PIL.Image.Image]], List[torch.Tensor]]
@@ -675,10 +675,8 @@ def load_sub_model(
    use_safetensors: bool,
    dduf_entries: Optional[Dict[str, DDUFEntry]],
    provider_options: Any,
-    quantization_config: Optional[Any] = None,
 ):
    """Helper method to load the module `name` from `library_name` and `class_name`"""
-    from ..quantizers import PipelineQuantizationConfig

    # retrieve class candidates

@@ -771,17 +769,6 @@ def load_sub_model(
        else:
            loading_kwargs["low_cpu_mem_usage"] = False

-    if (
-        quantization_config is not None
-        and isinstance(quantization_config, PipelineQuantizationConfig)
-        and issubclass(class_obj, torch.nn.Module)
-    ):
-        model_quant_config = quantization_config._resolve_quant_config(
-            is_diffusers=is_diffusers_model, module_name=name
-        )
-        if model_quant_config is not None:
-            loading_kwargs["quantization_config"] = model_quant_config
-
    # check if the module is in a subdirectory
    if dduf_entries:
        loading_kwargs["dduf_entries"] = dduf_entries
@@ -47,7 +47,6 @@ from ..configuration_utils import ConfigMixin
 from ..models import AutoencoderKL
 from ..models.attention_processor import FusedAttnProcessor2_0
 from ..models.modeling_utils import _LOW_CPU_MEM_USAGE_DEFAULT, ModelMixin
-from ..quantizers import PipelineQuantizationConfig
 from ..quantizers.bitsandbytes.utils import _check_bnb_status
 from ..schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
 from ..utils import (
@@ -726,7 +725,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
        use_safetensors = kwargs.pop("use_safetensors", None)
        use_onnx = kwargs.pop("use_onnx", None)
        load_connected_pipeline = kwargs.pop("load_connected_pipeline", False)
-        quantization_config = kwargs.pop("quantization_config", None)

        if torch_dtype is not None and not isinstance(torch_dtype, dict) and not isinstance(torch_dtype, torch.dtype):
            torch_dtype = torch.float32
@@ -743,9 +741,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                " install accelerate\n```\n."
            )

-        if quantization_config is not None and not isinstance(quantization_config, PipelineQuantizationConfig):
-            raise ValueError("`quantization_config` must be an instance of `PipelineQuantizationConfig`.")
-
        if low_cpu_mem_usage is True and not is_torch_version(">=", "1.9.0"):
            raise NotImplementedError(
                "Low memory initialization requires torch >= 1.9.0. Please either update your PyTorch version or set"
@@ -1006,7 +1001,6 @@ class DiffusionPipeline(ConfigMixin, PushToHubMixin):
                    use_safetensors=use_safetensors,
                    dduf_entries=dduf_entries,
                    provider_options=provider_options,
-                    quantization_config=quantization_config,
                )
                logger.info(
                    f"Loaded {name} as {class_name} from `{name}` subfolder of {pretrained_model_name_or_path}."
@@ -30,11 +30,18 @@ except OptionalDependencyNotAvailable:
    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
    _import_structure["clip_image_project_model"] = ["CLIPImageProjection"]
+    _import_structure["pipeline_cycle_diffusion"] = ["CycleDiffusionPipeline"]
    _import_structure["pipeline_stable_diffusion"] = ["StableDiffusionPipeline"]
+    _import_structure["pipeline_stable_diffusion_attend_and_excite"] = ["StableDiffusionAttendAndExcitePipeline"]
+    _import_structure["pipeline_stable_diffusion_gligen"] = ["StableDiffusionGLIGENPipeline"]
+    _import_structure["pipeline_stable_diffusion_gligen_text_image"] = ["StableDiffusionGLIGENTextImagePipeline"]
    _import_structure["pipeline_stable_diffusion_img2img"] = ["StableDiffusionImg2ImgPipeline"]
    _import_structure["pipeline_stable_diffusion_inpaint"] = ["StableDiffusionInpaintPipeline"]
+    _import_structure["pipeline_stable_diffusion_inpaint_legacy"] = ["StableDiffusionInpaintPipelineLegacy"]
    _import_structure["pipeline_stable_diffusion_instruct_pix2pix"] = ["StableDiffusionInstructPix2PixPipeline"]
    _import_structure["pipeline_stable_diffusion_latent_upscale"] = ["StableDiffusionLatentUpscalePipeline"]
+    _import_structure["pipeline_stable_diffusion_model_editing"] = ["StableDiffusionModelEditingPipeline"]
+    _import_structure["pipeline_stable_diffusion_paradigms"] = ["StableDiffusionParadigmsPipeline"]
    _import_structure["pipeline_stable_diffusion_upscale"] = ["StableDiffusionUpscalePipeline"]
    _import_structure["pipeline_stable_unclip"] = ["StableUnCLIPPipeline"]
    _import_structure["pipeline_stable_unclip_img2img"] = ["StableUnCLIPImg2ImgPipeline"]
@@ -12,183 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import inspect
-from typing import Dict, List, Optional, Union
-
-from ..utils import is_transformers_available, logging
 from .auto import DiffusersAutoQuantizer
 from .base import DiffusersQuantizer
-from .quantization_config import QuantizationConfigMixin as DiffQuantConfigMixin
-
-
-try:
-    from transformers.utils.quantization_config import QuantizationConfigMixin as TransformersQuantConfigMixin
-except ImportError:
-
-    class TransformersQuantConfigMixin:
-        pass
-
-
-logger = logging.get_logger(__name__)
-
-
-class PipelineQuantizationConfig:
-    """
-    Configuration class to be used when applying quantization on-the-fly to [`~DiffusionPipeline.from_pretrained`].
-
-    Args:
-        quant_backend (`str`): Quantization backend to be used. When using this option, we assume that the backend
-            is available to both `diffusers` and `transformers`.
-        quant_kwargs (`dict`): Params to initialize the quantization backend class.
-        components_to_quantize (`list`): Components of a pipeline to be quantized.
-        quant_mapping (`dict`): Mapping defining the quantization specs to be used for the pipeline
-            components. When using this argument, users are not expected to provide `quant_backend`, `quant_kawargs`,
-            and `components_to_quantize`.
-    """
-
-    def __init__(
-        self,
-        quant_backend: str = None,
-        quant_kwargs: Dict[str, Union[str, float, int, dict]] = None,
-        components_to_quantize: Optional[List[str]] = None,
-        quant_mapping: Dict[str, Union[DiffQuantConfigMixin, "TransformersQuantConfigMixin"]] = None,
-    ):
-        self.quant_backend = quant_backend
-        # Initialize kwargs to be {} to set to the defaults.
-        self.quant_kwargs = quant_kwargs or {}
-        self.components_to_quantize = components_to_quantize
-        self.quant_mapping = quant_mapping
-
-        self.post_init()
-
-    def post_init(self):
-        quant_mapping = self.quant_mapping
-        self.is_granular = True if quant_mapping is not None else False
-
-        self._validate_init_args()
-
-    def _validate_init_args(self):
-        if self.quant_backend and self.quant_mapping:
-            raise ValueError("Both `quant_backend` and `quant_mapping` cannot be specified at the same time.")
-
-        if not self.quant_mapping and not self.quant_backend:
-            raise ValueError("Must provide a `quant_backend` when not providing a `quant_mapping`.")
-
-        if not self.quant_kwargs and not self.quant_mapping:
-            raise ValueError("Both `quant_kwargs` and `quant_mapping` cannot be None.")
-
-        if self.quant_backend is not None:
-            self._validate_init_kwargs_in_backends()
-
-        if self.quant_mapping is not None:
-            self._validate_quant_mapping_args()
-
-    def _validate_init_kwargs_in_backends(self):
-        quant_backend = self.quant_backend
-
-        self._check_backend_availability(quant_backend)
-
-        quant_config_mapping_transformers, quant_config_mapping_diffusers = self._get_quant_config_list()
-
-        if quant_config_mapping_transformers is not None:
-            init_kwargs_transformers = inspect.signature(quant_config_mapping_transformers[quant_backend].__init__)
-            init_kwargs_transformers = {name for name in init_kwargs_transformers.parameters if name != "self"}
-        else:
-            init_kwargs_transformers = None
-
-        init_kwargs_diffusers = inspect.signature(quant_config_mapping_diffusers[quant_backend].__init__)
-        init_kwargs_diffusers = {name for name in init_kwargs_diffusers.parameters if name != "self"}
-
-        if init_kwargs_transformers != init_kwargs_diffusers:
-            raise ValueError(
-                "The signatures of the __init__ methods of the quantization config classes in `diffusers` and `transformers` don't match. "
-                f"Please provide a `quant_mapping` instead, in the {self.__class__.__name__} class. Refer to [the docs](https://huggingface.co/docs/diffusers/main/en/quantization/overview#pipeline-level-quantization) to learn more about how "
-                "this mapping would look like."
-            )
-
-    def _validate_quant_mapping_args(self):
-        quant_mapping = self.quant_mapping
-        transformers_map, diffusers_map = self._get_quant_config_list()
-
-        available_transformers = list(transformers_map.values()) if transformers_map else None
-        available_diffusers = list(diffusers_map.values())
-
-        for module_name, config in quant_mapping.items():
-            if any(isinstance(config, cfg) for cfg in available_diffusers):
-                continue
-
-            if available_transformers and any(isinstance(config, cfg) for cfg in available_transformers):
-                continue
-
-            if available_transformers:
-                raise ValueError(
-                    f"Provided config for module_name={module_name} could not be found. "
-                    f"Available diffusers configs: {available_diffusers}; "
-                    f"Available transformers configs: {available_transformers}."
-                )
-            else:
-                raise ValueError(
-                    f"Provided config for module_name={module_name} could not be found. "
-                    f"Available diffusers configs: {available_diffusers}."
-                )
-
-    def _check_backend_availability(self, quant_backend: str):
-        quant_config_mapping_transformers, quant_config_mapping_diffusers = self._get_quant_config_list()
-
-        available_backends_transformers = (
-            list(quant_config_mapping_transformers.keys()) if quant_config_mapping_transformers else None
-        )
-        available_backends_diffusers = list(quant_config_mapping_diffusers.keys())
-
-        if (
-            available_backends_transformers and quant_backend not in available_backends_transformers
-        ) or quant_backend not in quant_config_mapping_diffusers:
-            error_message = f"Provided quant_backend={quant_backend} was not found."
-            if available_backends_transformers:
-                error_message += f"\nAvailable ones (transformers): {available_backends_transformers}."
-            error_message += f"\nAvailable ones (diffusers): {available_backends_diffusers}."
-            raise ValueError(error_message)
-
-    def _resolve_quant_config(self, is_diffusers: bool = True, module_name: str = None):
-        quant_config_mapping_transformers, quant_config_mapping_diffusers = self._get_quant_config_list()
-
-        quant_mapping = self.quant_mapping
-        components_to_quantize = self.components_to_quantize
-
-        # Granular case
-        if self.is_granular and module_name in quant_mapping:
-            logger.debug(f"Initializing quantization config class for {module_name}.")
-            config = quant_mapping[module_name]
-            return config
-
-        # Global config case
-        else:
-            should_quantize = False
-            # Only quantize the modules requested for.
-            if components_to_quantize and module_name in components_to_quantize:
-                should_quantize = True
-            # No specification for `components_to_quantize` means all modules should be quantized.
-            elif not self.is_granular and not components_to_quantize:
-                should_quantize = True
-
-            if should_quantize:
-                logger.debug(f"Initializing quantization config class for {module_name}.")
-                mapping_to_use = quant_config_mapping_diffusers if is_diffusers else quant_config_mapping_transformers
-                quant_config_cls = mapping_to_use[self.quant_backend]
-                quant_kwargs = self.quant_kwargs
-                return quant_config_cls(**quant_kwargs)
-
-        # Fallback: no applicable configuration found.
-        return None
-
-    def _get_quant_config_list(self):
-        if is_transformers_available():
-            from transformers.quantizers.auto import (
-                AUTO_QUANTIZATION_CONFIG_MAPPING as quant_config_mapping_transformers,
-            )
-        else:
-            quant_config_mapping_transformers = None
-
-        from ..quantizers.auto import AUTO_QUANTIZATION_CONFIG_MAPPING as quant_config_mapping_diffusers
-
-        return quant_config_mapping_transformers, quant_config_mapping_diffusers
@@ -75,7 +75,7 @@ class QuantizationConfigMixin:
        Args:
            config_dict (`Dict[str, Any]`):
                Dictionary that will be used to instantiate the configuration object.
-            return_unused_kwargs (`bool`, *optional*, defaults to `False`):
+            return_unused_kwargs (`bool`,*optional*, defaults to `False`):
                Whether or not to return a list of unused keyword arguments. Used for `from_pretrained` method in
                `PreTrainedModel`.
            kwargs (`Dict[str, Any]`):
@@ -144,7 +144,7 @@ class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):

    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.precondition_inputs
    def precondition_inputs(self, sample, sigma):
-        c_in = self._get_conditioning_c_in(sigma)
+        c_in = 1 / ((sigma**2 + self.config.sigma_data**2) ** 0.5)
        scaled_sample = sample * c_in
        return scaled_sample

@@ -568,10 +568,5 @@ class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
        noisy_samples = original_samples + noise * sigma
        return noisy_samples

-    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._get_conditioning_c_in
-    def _get_conditioning_c_in(self, sigma):
-        c_in = 1 / ((sigma**2 + self.config.sigma_data**2) ** 0.5)
-        return c_in
-
    def __len__(self):
        return self.config.num_train_timesteps
@@ -176,7 +176,7 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):

    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler.precondition_inputs
    def precondition_inputs(self, sample, sigma):
-        c_in = self._get_conditioning_c_in(sigma)
+        c_in = 1 / ((sigma**2 + self.config.sigma_data**2) ** 0.5)
        scaled_sample = sample * c_in
        return scaled_sample

@@ -703,10 +703,5 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
        noisy_samples = original_samples + noise * sigma
        return noisy_samples

-    # Copied from diffusers.schedulers.scheduling_edm_euler.EDMEulerScheduler._get_conditioning_c_in
-    def _get_conditioning_c_in(self, sigma):
-        c_in = 1 / ((sigma**2 + self.config.sigma_data**2) ** 0.5)
-        return c_in
-
    def __len__(self):
        return self.config.num_train_timesteps
@@ -103,13 +103,11 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
        # setable values
        self.num_inference_steps = None

-        sigmas_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
-        sigmas = torch.arange(num_train_timesteps + 1, dtype=sigmas_dtype) / num_train_timesteps
+        sigmas = torch.arange(num_train_timesteps + 1) / num_train_timesteps
        if sigma_schedule == "karras":
            sigmas = self._compute_karras_sigmas(sigmas)
        elif sigma_schedule == "exponential":
            sigmas = self._compute_exponential_sigmas(sigmas)
-        sigmas = sigmas.to(torch.float32)

        self.timesteps = self.precondition_noise(sigmas)

@@ -161,7 +159,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
        self._begin_index = begin_index

    def precondition_inputs(self, sample, sigma):
-        c_in = self._get_conditioning_c_in(sigma)
+        c_in = 1 / ((sigma**2 + self.config.sigma_data**2) ** 0.5)
        scaled_sample = sample * c_in
        return scaled_sample

@@ -232,19 +230,18 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
        """
        self.num_inference_steps = num_inference_steps

-        sigmas_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
        if sigmas is None:
-            sigmas = torch.linspace(0, 1, self.num_inference_steps, dtype=sigmas_dtype)
+            sigmas = torch.linspace(0, 1, self.num_inference_steps)
        elif isinstance(sigmas, float):
-            sigmas = torch.tensor(sigmas, dtype=sigmas_dtype)
+            sigmas = torch.tensor(sigmas, dtype=torch.float32)
        else:
-            sigmas = sigmas.to(sigmas_dtype)
+            sigmas = sigmas
        if self.config.sigma_schedule == "karras":
            sigmas = self._compute_karras_sigmas(sigmas)
        elif self.config.sigma_schedule == "exponential":
            sigmas = self._compute_exponential_sigmas(sigmas)
-        sigmas = sigmas.to(dtype=torch.float32, device=device)

+        sigmas = sigmas.to(dtype=torch.float32, device=device)
        self.timesteps = self.precondition_noise(sigmas)

        if self.config.final_sigmas_type == "sigma_min":
@@ -318,7 +315,6 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
        s_noise: float = 1.0,
        generator: Optional[torch.Generator] = None,
        return_dict: bool = True,
-        pred_original_sample: Optional[torch.Tensor] = None,
    ) -> Union[EDMEulerSchedulerOutput, Tuple]:
        """
        Predict the sample from the previous timestep by reversing the SDE. This function propagates the diffusion
@@ -382,8 +378,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
            sample = sample + eps * (sigma_hat**2 - sigma**2) ** 0.5

        # 1. compute predicted original sample (x_0) from sigma-scaled predicted noise
-        if pred_original_sample is None:
-            pred_original_sample = self.precondition_outputs(sample, model_output, sigma_hat)
+        pred_original_sample = self.precondition_outputs(sample, model_output, sigma_hat)

        # 2. Convert to an ODE derivative
        derivative = (sample - pred_original_sample) / sigma_hat
@@ -440,9 +435,5 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
        noisy_samples = original_samples + noise * sigma
        return noisy_samples

-    def _get_conditioning_c_in(self, sigma):
-        c_in = 1 / ((sigma**2 + self.config.sigma_data**2) ** 0.5)
-        return c_in
-
    def __len__(self):
        return self.config.num_train_timesteps
@@ -62,11 +62,9 @@ from .import_utils import (
    get_objects_from_module,
    is_accelerate_available,
    is_accelerate_version,
-    is_better_profanity_available,
    is_bitsandbytes_available,
    is_bitsandbytes_version,
    is_bs4_available,
-    is_cosmos_guardrail_available,
    is_flax_available,
    is_ftfy_available,
    is_gguf_available,
@@ -80,7 +78,6 @@ from .import_utils import (
    is_k_diffusion_version,
    is_librosa_available,
    is_matplotlib_available,
-    is_nltk_available,
    is_note_seq_available,
    is_onnx_available,
    is_opencv_available,
@@ -88,7 +85,6 @@ from .import_utils import (
    is_optimum_quanto_version,
    is_peft_available,
    is_peft_version,
-    is_pytorch_retinaface_available,
    is_safetensors_available,
    is_scipy_available,
    is_sentencepiece_available,
@@ -160,21 +160,6 @@ class AutoencoderKLCogVideoX(metaclass=DummyObject):
        requires_backends(cls, ["torch"])


-class AutoencoderKLCosmos(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-
 class AutoencoderKLHunyuanVideo(metaclass=DummyObject):
    _backends = ["torch"]

@@ -445,21 +430,6 @@ class ControlNetXSAdapter(metaclass=DummyObject):
        requires_backends(cls, ["torch"])


-class CosmosTransformer3DModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-
 class DiTTransformer2DModel(metaclass=DummyObject):
    _backends = ["torch"]

@@ -595,21 +565,6 @@ class HunyuanDiT2DMultiControlNetModel(metaclass=DummyObject):
        requires_backends(cls, ["torch"])


-class HunyuanVideoFramepackTransformer3DModel(metaclass=DummyObject):
-    _backends = ["torch"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch"])
-
-
 class HunyuanVideoTransformer3DModel(metaclass=DummyObject):
    _backends = ["torch"]

@@ -392,51 +392,6 @@ class CogView4Pipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


-class ConsisIDPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
-class CosmosTextToWorldPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
-class CosmosVideoToWorldPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
 class CycleDiffusionPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

@@ -737,21 +692,6 @@ class HunyuanSkyreelsImageToVideoPipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


-class HunyuanVideoFramepackPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers"])
-
-
 class HunyuanVideoImageToVideoPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

@@ -215,10 +215,6 @@ _gguf_available, _gguf_version = _is_package_available("gguf")
 _torchao_available, _torchao_version = _is_package_available("torchao")
 _bitsandbytes_available, _bitsandbytes_version = _is_package_available("bitsandbytes")
 _optimum_quanto_available, _optimum_quanto_version = _is_package_available("optimum", get_dist_name=True)
-_pytorch_retinaface_available, _pytorch_retinaface_version = _is_package_available("pytorch_retinaface")
-_better_profanity_available, _better_profanity_version = _is_package_available("better_profanity")
-_nltk_available, _nltk_version = _is_package_available("nltk")
-_cosmos_guardrail_available, _cosmos_guardrail_version = _is_package_available("cosmos_guardrail")


 def is_torch_available():
@@ -357,22 +353,6 @@ def is_timm_available():
    return _timm_available


-def is_pytorch_retinaface_available():
-    return _pytorch_retinaface_available
-
-
-def is_better_profanity_available():
-    return _better_profanity_available
-
-
-def is_nltk_available():
-    return _nltk_available
-
-
-def is_cosmos_guardrail_available():
-    return _cosmos_guardrail_available
-
-
 def is_hpu_available():
    return all(importlib.util.find_spec(lib) for lib in ("habana_frameworks", "habana_frameworks.torch"))

@@ -525,22 +505,6 @@ QUANTO_IMPORT_ERROR = """
 install optimum-quanto`
 """

-# docstyle-ignore
-PYTORCH_RETINAFACE_IMPORT_ERROR = """
-{0} requires the pytorch_retinaface library but it was not found in your environment. You can install it with pip: `pip install pytorch_retinaface`
-"""
-
-# docstyle-ignore
-BETTER_PROFANITY_IMPORT_ERROR = """
-{0} requires the better_profanity library but it was not found in your environment. You can install it with pip: `pip install better_profanity`
-"""
-
-# docstyle-ignore
-NLTK_IMPORT_ERROR = """
-{0} requires the nltk library but it was not found in your environment. You can install it with pip: `pip install nltk`
-"""
-
-
 BACKENDS_MAPPING = OrderedDict(
    [
        ("bs4", (is_bs4_available, BS4_IMPORT_ERROR)),
@@ -569,9 +533,6 @@ BACKENDS_MAPPING = OrderedDict(
        ("gguf", (is_gguf_available, GGUF_IMPORT_ERROR)),
        ("torchao", (is_torchao_available, TORCHAO_IMPORT_ERROR)),
        ("quanto", (is_optimum_quanto_available, QUANTO_IMPORT_ERROR)),
-        ("pytorch_retinaface", (is_pytorch_retinaface_available, PYTORCH_RETINAFACE_IMPORT_ERROR)),
-        ("better_profanity", (is_better_profanity_available, BETTER_PROFANITY_IMPORT_ERROR)),
-        ("nltk", (is_nltk_available, NLTK_IMPORT_ERROR)),
    ]
 )

@@ -38,7 +38,6 @@ from .import_utils import (
    is_note_seq_available,
    is_onnx_available,
    is_opencv_available,
-    is_optimum_quanto_available,
    is_peft_available,
    is_timm_available,
    is_torch_available,
@@ -487,13 +486,6 @@ def require_bitsandbytes(test_case):
    return unittest.skipUnless(is_bitsandbytes_available(), "test requires bitsandbytes")(test_case)


-def require_quanto(test_case):
-    """
-    Decorator marking a test that requires quanto. These tests are skipped when quanto isn't installed.
-    """
-    return unittest.skipUnless(is_optimum_quanto_available(), "test requires quanto")(test_case)
-
-
 def require_accelerate(test_case):
    """
    Decorator marking a test that requires accelerate. These tests are skipped when accelerate isn't installed.
@@ -31,14 +31,13 @@ from diffusers import FlowMatchEulerDiscreteScheduler, FluxControlPipeline, Flux
 from diffusers.utils import load_image, logging
 from diffusers.utils.testing_utils import (
    CaptureLogger,
-    backend_empty_cache,
    floats_tensor,
    is_peft_available,
    nightly,
    numpy_cosine_similarity_distance,
-    require_big_accelerator,
+    require_big_gpu_with_torch_cuda,
    require_peft_backend,
-    require_torch_accelerator,
+    require_torch_gpu,
    slow,
    torch_device,
 )
@@ -810,10 +809,10 @@ class FluxControlLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):

@slow
@nightly
-@require_torch_accelerator
+@require_torch_gpu
@require_peft_backend
-@require_big_accelerator
-@pytest.mark.big_accelerator
+@require_big_gpu_with_torch_cuda
+@pytest.mark.big_gpu_with_torch_cuda
 class FluxLoRAIntegrationTests(unittest.TestCase):
    """internal note: The integration slices were obtained on audace.

@@ -828,7 +827,7 @@ class FluxLoRAIntegrationTests(unittest.TestCase):
        super().setUp()

        gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

        self.pipeline = FluxPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16)

@@ -837,13 +836,13 @@ class FluxLoRAIntegrationTests(unittest.TestCase):

        del self.pipeline
        gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

    def test_flux_the_last_ben(self):
        self.pipeline.load_lora_weights("TheLastBen/Jon_Snow_Flux_LoRA", weight_name="jon_snow.safetensors")
        self.pipeline.fuse_lora()
        self.pipeline.unload_lora_weights()
-        # Instead of calling `enable_model_cpu_offload()`, we do a accelerator placement here because the CI
+        # Instead of calling `enable_model_cpu_offload()`, we do a cuda placement here because the CI
        # run supports it. We have about 34GB RAM in the CI runner which kills the test when run with
        # `enable_model_cpu_offload()`. We repeat this for the other tests, too.
        self.pipeline = self.pipeline.to(torch_device)
@@ -957,10 +956,10 @@ class FluxLoRAIntegrationTests(unittest.TestCase):


@nightly
-@require_torch_accelerator
+@require_torch_gpu
@require_peft_backend
-@require_big_accelerator
-@pytest.mark.big_accelerator
+@require_big_gpu_with_torch_cuda
+@pytest.mark.big_gpu_with_torch_cuda
 class FluxControlLoRAIntegrationTests(unittest.TestCase):
    num_inference_steps = 10
    seed = 0
@@ -970,17 +969,17 @@ class FluxControlLoRAIntegrationTests(unittest.TestCase):
        super().setUp()

        gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

        self.pipeline = FluxControlPipeline.from_pretrained(
            "black-forest-labs/FLUX.1-dev", torch_dtype=torch.bfloat16
-        ).to(torch_device)
+        ).to("cuda")

    def tearDown(self):
        super().tearDown()

        gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

    @parameterized.expand(["black-forest-labs/FLUX.1-Canny-dev-lora", "black-forest-labs/FLUX.1-Depth-dev-lora"])
    def test_lora(self, lora_ckpt_id):
@@ -28,16 +28,13 @@ from diffusers import (
    HunyuanVideoTransformer3DModel,
 )
 from diffusers.utils.testing_utils import (
-    Expectations,
-    backend_empty_cache,
    floats_tensor,
    nightly,
    numpy_cosine_similarity_distance,
-    require_big_accelerator,
+    require_big_gpu_with_torch_cuda,
    require_peft_backend,
-    require_torch_accelerator,
+    require_torch_gpu,
    skip_mps,
-    torch_device,
 )


@@ -195,10 +192,10 @@ class HunyuanVideoLoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):


@nightly
-@require_torch_accelerator
+@require_torch_gpu
@require_peft_backend
-@require_big_accelerator
-@pytest.mark.big_accelerator
+@require_big_gpu_with_torch_cuda
+@pytest.mark.big_gpu_with_torch_cuda
 class HunyuanVideoLoRAIntegrationTests(unittest.TestCase):
    """internal note: The integration slices were obtained on DGX.

@@ -213,7 +210,7 @@ class HunyuanVideoLoRAIntegrationTests(unittest.TestCase):
        super().setUp()

        gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

        model_id = "hunyuanvideo-community/HunyuanVideo"
        transformer = HunyuanVideoTransformer3DModel.from_pretrained(
@@ -221,13 +218,13 @@ class HunyuanVideoLoRAIntegrationTests(unittest.TestCase):
        )
        self.pipeline = HunyuanVideoPipeline.from_pretrained(
            model_id, transformer=transformer, torch_dtype=torch.float16
-        ).to(torch_device)
+        ).to("cuda")

    def tearDown(self):
        super().tearDown()

        gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

    def test_original_format_cseti(self):
        self.pipeline.load_lora_weights(
@@ -252,13 +249,8 @@ class HunyuanVideoLoRAIntegrationTests(unittest.TestCase):
        out_slice = np.concatenate((out[:8], out[-8:]))

        # fmt: off
-        expected_slices = Expectations(
-            {
-                ("cuda", 7): np.array([0.1013, 0.1924, 0.0078, 0.1021, 0.1929, 0.0078, 0.1023, 0.1919, 0.7402, 0.104, 0.4482, 0.7354, 0.0925, 0.4382, 0.7275, 0.0815]),
-            }
-        )
+        expected_slice = np.array([0.1013, 0.1924, 0.0078, 0.1021, 0.1929, 0.0078, 0.1023, 0.1919, 0.7402, 0.104, 0.4482, 0.7354, 0.0925, 0.4382, 0.7275, 0.0815])
        # fmt: on
-        expected_slice = expected_slices.get_expectation()

        max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), out_slice)

@@ -93,12 +93,12 @@ class StableDiffusionLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

    def tearDown(self):
        super().tearDown()
        gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

    # Keeping this test here makes sense because it doesn't look any integration
    # (value assertions on logits).
@@ -34,7 +34,7 @@ from diffusers.utils.testing_utils import (
    is_flaky,
    nightly,
    numpy_cosine_similarity_distance,
-    require_big_accelerator,
+    require_big_gpu_with_torch_cuda,
    require_peft_backend,
    require_torch_accelerator,
    torch_device,
@@ -138,8 +138,8 @@ class SD3LoRATests(unittest.TestCase, PeftLoraLoaderMixinTests):
@nightly
@require_torch_accelerator
@require_peft_backend
-@require_big_accelerator
-@pytest.mark.big_accelerator
+@require_big_gpu_with_torch_cuda
+@pytest.mark.big_gpu_with_torch_cuda
 class SD3LoraIntegrationTests(unittest.TestCase):
    pipeline_class = StableDiffusion3Img2ImgPipeline
    repo_id = "stabilityai/stable-diffusion-3-medium-diffusers"
@@ -37,13 +37,12 @@ from diffusers.utils import logging
 from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
    CaptureLogger,
-    backend_empty_cache,
    is_flaky,
    load_image,
    nightly,
    numpy_cosine_similarity_distance,
    require_peft_backend,
-    require_torch_accelerator,
+    require_torch_gpu,
    slow,
    torch_device,
 )
@@ -106,12 +105,12 @@ class StableDiffusionXLLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

    def tearDown(self):
        super().tearDown()
        gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

    @is_flaky
    def test_multiple_wrong_adapter_name_raises_error(self):
@@ -120,18 +119,18 @@ class StableDiffusionXLLoRATests(PeftLoraLoaderMixinTests, unittest.TestCase):

@slow
@nightly
-@require_torch_accelerator
+@require_torch_gpu
@require_peft_backend
 class LoraSDXLIntegrationTests(unittest.TestCase):
    def setUp(self):
        super().setUp()
        gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

    def tearDown(self):
        super().tearDown()
        gc.collect()
-        backend_empty_cache(torch_device)
+        torch.cuda.empty_cache()

    def test_sdxl_1_0_lora(self):
        generator = torch.Generator("cpu").manual_seed(0)
@@ -1,86 +0,0 @@
-# Copyright 2024 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-from diffusers import AutoencoderKLCosmos
-from diffusers.utils.testing_utils import enable_full_determinism, floats_tensor, torch_device
-
-from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin
-
-
-enable_full_determinism()
-
-
-class AutoencoderKLCosmosTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
-    model_class = AutoencoderKLCosmos
-    main_input_name = "sample"
-    base_precision = 1e-2
-
-    def get_autoencoder_kl_cosmos_config(self):
-        return {
-            "in_channels": 3,
-            "out_channels": 3,
-            "latent_channels": 4,
-            "encoder_block_out_channels": (8, 8, 8, 8),
-            "decode_block_out_channels": (8, 8, 8, 8),
-            "attention_resolutions": (8,),
-            "resolution": 64,
-            "num_layers": 2,
-            "patch_size": 4,
-            "patch_type": "haar",
-            "scaling_factor": 1.0,
-            "spatial_compression_ratio": 4,
-            "temporal_compression_ratio": 4,
-        }
-
-    @property
-    def dummy_input(self):
-        batch_size = 2
-        num_frames = 9
-        num_channels = 3
-        height = 32
-        width = 32
-
-        image = floats_tensor((batch_size, num_channels, num_frames, height, width)).to(torch_device)
-
-        return {"sample": image}
-
-    @property
-    def input_shape(self):
-        return (3, 9, 32, 32)
-
-    @property
-    def output_shape(self):
-        return (3, 9, 32, 32)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = self.get_autoencoder_kl_cosmos_config()
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {
-            "CosmosEncoder3d",
-            "CosmosDecoder3d",
-        }
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-
-    @unittest.skip("Not sure why this test fails. Investigate later.")
-    def test_effective_gradient_checkpointing(self):
-        pass
-
-    @unittest.skip("Unsupported test.")
-    def test_forward_with_norm_groups(self):
-        pass
@@ -62,6 +62,7 @@ from diffusers.utils.testing_utils import (
    backend_max_memory_allocated,
    backend_reset_peak_memory_stats,
    backend_synchronize,
+    floats_tensor,
    get_python_version,
    is_torch_compile,
    numpy_cosine_similarity_distance,
@@ -1753,7 +1754,7 @@ class TorchCompileTesterMixin:
@require_peft_backend
@require_peft_version_greater("0.14.0")
@is_torch_compile
-class LoraHotSwappingForModelTesterMixin:
+class TestLoraHotSwappingForModel(unittest.TestCase):
    """Test that hotswapping does not result in recompilation on the model directly.

    We're not extensively testing the hotswapping functionality since it is implemented in PEFT and is extensively
@@ -1774,24 +1775,48 @@ class LoraHotSwappingForModelTesterMixin:
        gc.collect()
        backend_empty_cache(torch_device)

-    def get_lora_config(self, lora_rank, lora_alpha, target_modules):
+    def get_small_unet(self):
+        # from diffusers UNet2DConditionModelTests
+        torch.manual_seed(0)
+        init_dict = {
+            "block_out_channels": (4, 8),
+            "norm_num_groups": 4,
+            "down_block_types": ("CrossAttnDownBlock2D", "DownBlock2D"),
+            "up_block_types": ("UpBlock2D", "CrossAttnUpBlock2D"),
+            "cross_attention_dim": 8,
+            "attention_head_dim": 2,
+            "out_channels": 4,
+            "in_channels": 4,
+            "layers_per_block": 1,
+            "sample_size": 16,
+        }
+        model = UNet2DConditionModel(**init_dict)
+        return model.to(torch_device)
+
+    def get_unet_lora_config(self, lora_rank, lora_alpha, target_modules):
        # from diffusers test_models_unet_2d_condition.py
        from peft import LoraConfig

-        lora_config = LoraConfig(
+        unet_lora_config = LoraConfig(
            r=lora_rank,
            lora_alpha=lora_alpha,
            target_modules=target_modules,
            init_lora_weights=False,
            use_dora=False,
        )
-        return lora_config
+        return unet_lora_config

-    def get_linear_module_name_other_than_attn(self, model):
-        linear_names = [
-            name for name, module in model.named_modules() if isinstance(module, nn.Linear) and "to_" not in name
-        ]
-        return linear_names[0]
+    def get_dummy_input(self):
+        # from UNet2DConditionModelTests
+        batch_size = 4
+        num_channels = 4
+        sizes = (16, 16)
+
+        noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device)
+        time_step = torch.tensor([10]).to(torch_device)
+        encoder_hidden_states = floats_tensor((batch_size, 4, 8)).to(torch_device)
+
+        return {"sample": noise, "timestep": time_step, "encoder_hidden_states": encoder_hidden_states}

    def check_model_hotswap(self, do_compile, rank0, rank1, target_modules0, target_modules1=None):
        """
@@ -1809,27 +1834,23 @@ class LoraHotSwappingForModelTesterMixin:
        fine.
        """
        # create 2 adapters with different ranks and alphas
-        torch.manual_seed(0)
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-
+        dummy_input = self.get_dummy_input()
        alpha0, alpha1 = rank0, rank1
        max_rank = max([rank0, rank1])
        if target_modules1 is None:
            target_modules1 = target_modules0[:]
-        lora_config0 = self.get_lora_config(rank0, alpha0, target_modules0)
-        lora_config1 = self.get_lora_config(rank1, alpha1, target_modules1)
+        lora_config0 = self.get_unet_lora_config(rank0, alpha0, target_modules0)
+        lora_config1 = self.get_unet_lora_config(rank1, alpha1, target_modules1)

-        model.add_adapter(lora_config0, adapter_name="adapter0")
+        unet = self.get_small_unet()
+        unet.add_adapter(lora_config0, adapter_name="adapter0")
        with torch.inference_mode():
-            torch.manual_seed(0)
-            output0_before = model(**inputs_dict)["sample"]
+            output0_before = unet(**dummy_input)["sample"]

-        model.add_adapter(lora_config1, adapter_name="adapter1")
-        model.set_adapter("adapter1")
+        unet.add_adapter(lora_config1, adapter_name="adapter1")
+        unet.set_adapter("adapter1")
        with torch.inference_mode():
-            torch.manual_seed(0)
-            output1_before = model(**inputs_dict)["sample"]
+            output1_before = unet(**dummy_input)["sample"]

        # sanity checks:
        tol = 5e-3
@@ -1839,43 +1860,40 @@ class LoraHotSwappingForModelTesterMixin:

        with tempfile.TemporaryDirectory() as tmp_dirname:
            # save the adapter checkpoints
-            model.save_lora_adapter(os.path.join(tmp_dirname, "0"), safe_serialization=True, adapter_name="adapter0")
-            model.save_lora_adapter(os.path.join(tmp_dirname, "1"), safe_serialization=True, adapter_name="adapter1")
-            del model
+            unet.save_lora_adapter(os.path.join(tmp_dirname, "0"), safe_serialization=True, adapter_name="adapter0")
+            unet.save_lora_adapter(os.path.join(tmp_dirname, "1"), safe_serialization=True, adapter_name="adapter1")
+            del unet

            # load the first adapter
-            torch.manual_seed(0)
-            init_dict, _ = self.prepare_init_args_and_inputs_for_common()
-            model = self.model_class(**init_dict).to(torch_device)
-
+            unet = self.get_small_unet()
            if do_compile or (rank0 != rank1):
                # no need to prepare if the model is not compiled or if the ranks are identical
-                model.enable_lora_hotswap(target_rank=max_rank)
+                unet.enable_lora_hotswap(target_rank=max_rank)

            file_name0 = os.path.join(os.path.join(tmp_dirname, "0"), "pytorch_lora_weights.safetensors")
            file_name1 = os.path.join(os.path.join(tmp_dirname, "1"), "pytorch_lora_weights.safetensors")
-            model.load_lora_adapter(file_name0, safe_serialization=True, adapter_name="adapter0", prefix=None)
+            unet.load_lora_adapter(file_name0, safe_serialization=True, adapter_name="adapter0", prefix=None)

            if do_compile:
-                model = torch.compile(model, mode="reduce-overhead")
+                unet = torch.compile(unet, mode="reduce-overhead")

            with torch.inference_mode():
-                output0_after = model(**inputs_dict)["sample"]
+                output0_after = unet(**dummy_input)["sample"]
            assert torch.allclose(output0_before, output0_after, atol=tol, rtol=tol)

            # hotswap the 2nd adapter
-            model.load_lora_adapter(file_name1, adapter_name="adapter0", hotswap=True, prefix=None)
+            unet.load_lora_adapter(file_name1, adapter_name="adapter0", hotswap=True, prefix=None)

            # we need to call forward to potentially trigger recompilation
            with torch.inference_mode():
-                output1_after = model(**inputs_dict)["sample"]
+                output1_after = unet(**dummy_input)["sample"]
            assert torch.allclose(output1_before, output1_after, atol=tol, rtol=tol)

            # check error when not passing valid adapter name
            name = "does-not-exist"
            msg = f"Trying to hotswap LoRA adapter '{name}' but there is no existing adapter by that name"
            with self.assertRaisesRegex(ValueError, msg):
-                model.load_lora_adapter(file_name1, adapter_name=name, hotswap=True, prefix=None)
+                unet.load_lora_adapter(file_name1, adapter_name=name, hotswap=True, prefix=None)

    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
    def test_hotswapping_model(self, rank0, rank1):
@@ -1892,9 +1910,6 @@ class LoraHotSwappingForModelTesterMixin:

    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
    def test_hotswapping_compiled_model_conv2d(self, rank0, rank1):
-        if "unet" not in self.model_class.__name__.lower():
-            return
-
        # It's important to add this context to raise an error on recompilation
        target_modules = ["conv", "conv1", "conv2"]
        with torch._dynamo.config.patch(error_on_recompile=True):
@@ -1902,77 +1917,52 @@ class LoraHotSwappingForModelTesterMixin:

    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
    def test_hotswapping_compiled_model_both_linear_and_conv2d(self, rank0, rank1):
-        if "unet" not in self.model_class.__name__.lower():
-            return
-
        # It's important to add this context to raise an error on recompilation
        target_modules = ["to_q", "conv"]
        with torch._dynamo.config.patch(error_on_recompile=True):
            self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)

-    @parameterized.expand([(11, 11), (7, 13), (13, 7)])  # important to test small to large and vice versa
-    def test_hotswapping_compiled_model_both_linear_and_other(self, rank0, rank1):
-        # In `test_hotswapping_compiled_model_both_linear_and_conv2d()`, we check if we can do hotswapping
-        # with `torch.compile()` for models that have both linear and conv layers. In this test, we check
-        # if we can target a linear layer from the transformer blocks and another linear layer from non-attention
-        # block.
-        target_modules = ["to_q"]
-        init_dict, _ = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict)
-
-        target_modules.append(self.get_linear_module_name_other_than_attn(model))
-        del model
-
-        # It's important to add this context to raise an error on recompilation
-        with torch._dynamo.config.patch(error_on_recompile=True):
-            self.check_model_hotswap(do_compile=True, rank0=rank0, rank1=rank1, target_modules0=target_modules)
-
    def test_enable_lora_hotswap_called_after_adapter_added_raises(self):
        # ensure that enable_lora_hotswap is called before loading the first adapter
-        lora_config = self.get_lora_config(8, 8, target_modules=["to_q"])
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-        model.add_adapter(lora_config)
-
+        lora_config = self.get_unet_lora_config(8, 8, target_modules=["to_q"])
+        unet = self.get_small_unet()
+        unet.add_adapter(lora_config)
        msg = re.escape("Call `enable_lora_hotswap` before loading the first adapter.")
        with self.assertRaisesRegex(RuntimeError, msg):
-            model.enable_lora_hotswap(target_rank=32)
+            unet.enable_lora_hotswap(target_rank=32)

    def test_enable_lora_hotswap_called_after_adapter_added_warning(self):
        # ensure that enable_lora_hotswap is called before loading the first adapter
        from diffusers.loaders.peft import logger

-        lora_config = self.get_lora_config(8, 8, target_modules=["to_q"])
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-        model.add_adapter(lora_config)
+        lora_config = self.get_unet_lora_config(8, 8, target_modules=["to_q"])
+        unet = self.get_small_unet()
+        unet.add_adapter(lora_config)
        msg = (
            "It is recommended to call `enable_lora_hotswap` before loading the first adapter to avoid recompilation."
        )
        with self.assertLogs(logger=logger, level="WARNING") as cm:
-            model.enable_lora_hotswap(target_rank=32, check_compiled="warn")
+            unet.enable_lora_hotswap(target_rank=32, check_compiled="warn")
            assert any(msg in log for log in cm.output)

    def test_enable_lora_hotswap_called_after_adapter_added_ignore(self):
        # check possibility to ignore the error/warning
-        lora_config = self.get_lora_config(8, 8, target_modules=["to_q"])
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-        model.add_adapter(lora_config)
+        lora_config = self.get_unet_lora_config(8, 8, target_modules=["to_q"])
+        unet = self.get_small_unet()
+        unet.add_adapter(lora_config)
        with warnings.catch_warnings(record=True) as w:
            warnings.simplefilter("always")  # Capture all warnings
-            model.enable_lora_hotswap(target_rank=32, check_compiled="warn")
+            unet.enable_lora_hotswap(target_rank=32, check_compiled="warn")
            self.assertEqual(len(w), 0, f"Expected no warnings, but got: {[str(warn.message) for warn in w]}")

    def test_enable_lora_hotswap_wrong_check_compiled_argument_raises(self):
        # check that wrong argument value raises an error
-        lora_config = self.get_lora_config(8, 8, target_modules=["to_q"])
-        init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common()
-        model = self.model_class(**init_dict).to(torch_device)
-        model.add_adapter(lora_config)
+        lora_config = self.get_unet_lora_config(8, 8, target_modules=["to_q"])
+        unet = self.get_small_unet()
+        unet.add_adapter(lora_config)
        msg = re.escape("check_compiles should be one of 'error', 'warn', or 'ignore', got 'wrong-argument' instead.")
        with self.assertRaisesRegex(ValueError, msg):
-            model.enable_lora_hotswap(target_rank=32, check_compiled="wrong-argument")
+            unet.enable_lora_hotswap(target_rank=32, check_compiled="wrong-argument")

    def test_hotswap_second_adapter_targets_more_layers_raises(self):
        # check the error and log
@@ -1,153 +0,0 @@
-# Copyright 2024 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import unittest
-
-import torch
-
-from diffusers import CosmosTransformer3DModel
-from diffusers.utils.testing_utils import enable_full_determinism, torch_device
-
-from ..test_modeling_common import ModelTesterMixin
-
-
-enable_full_determinism()
-
-
-class CosmosTransformer3DModelTests(ModelTesterMixin, unittest.TestCase):
-    model_class = CosmosTransformer3DModel
-    main_input_name = "hidden_states"
-    uses_custom_attn_processor = True
-
-    @property
-    def dummy_input(self):
-        batch_size = 1
-        num_channels = 4
-        num_frames = 1
-        height = 16
-        width = 16
-        text_embed_dim = 16
-        sequence_length = 12
-        fps = 30
-
-        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, text_embed_dim)).to(torch_device)
-        attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device)
-        padding_mask = torch.zeros(batch_size, 1, height, width).to(torch_device)
-
-        return {
-            "hidden_states": hidden_states,
-            "timestep": timestep,
-            "encoder_hidden_states": encoder_hidden_states,
-            "attention_mask": attention_mask,
-            "fps": fps,
-            "padding_mask": padding_mask,
-        }
-
-    @property
-    def input_shape(self):
-        return (4, 1, 16, 16)
-
-    @property
-    def output_shape(self):
-        return (4, 1, 16, 16)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "in_channels": 4,
-            "out_channels": 4,
-            "num_attention_heads": 2,
-            "attention_head_dim": 12,
-            "num_layers": 2,
-            "mlp_ratio": 2,
-            "text_embed_dim": 16,
-            "adaln_lora_dim": 4,
-            "max_size": (4, 32, 32),
-            "patch_size": (1, 2, 2),
-            "rope_scale": (2.0, 1.0, 1.0),
-            "concat_padding_mask": True,
-            "extra_pos_embed_type": "learnable",
-        }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {"CosmosTransformer3DModel"}
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
-
-
-class CosmosTransformer3DModelVideoToWorldTests(ModelTesterMixin, unittest.TestCase):
-    model_class = CosmosTransformer3DModel
-    main_input_name = "hidden_states"
-    uses_custom_attn_processor = True
-
-    @property
-    def dummy_input(self):
-        batch_size = 1
-        num_channels = 4
-        num_frames = 1
-        height = 16
-        width = 16
-        text_embed_dim = 16
-        sequence_length = 12
-        fps = 30
-
-        hidden_states = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
-        timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
-        encoder_hidden_states = torch.randn((batch_size, sequence_length, text_embed_dim)).to(torch_device)
-        attention_mask = torch.ones((batch_size, sequence_length)).to(torch_device)
-        condition_mask = torch.ones(batch_size, 1, num_frames, height, width).to(torch_device)
-        padding_mask = torch.zeros(batch_size, 1, height, width).to(torch_device)
-
-        return {
-            "hidden_states": hidden_states,
-            "timestep": timestep,
-            "encoder_hidden_states": encoder_hidden_states,
-            "attention_mask": attention_mask,
-            "fps": fps,
-            "condition_mask": condition_mask,
-            "padding_mask": padding_mask,
-        }
-
-    @property
-    def input_shape(self):
-        return (4, 1, 16, 16)
-
-    @property
-    def output_shape(self):
-        return (4, 1, 16, 16)
-
-    def prepare_init_args_and_inputs_for_common(self):
-        init_dict = {
-            "in_channels": 4 + 1,
-            "out_channels": 4,
-            "num_attention_heads": 2,
-            "attention_head_dim": 12,
-            "num_layers": 2,
-            "mlp_ratio": 2,
-            "text_embed_dim": 16,
-            "adaln_lora_dim": 4,
-            "max_size": (4, 32, 32),
-            "patch_size": (1, 2, 2),
-            "rope_scale": (2.0, 1.0, 1.0),
-            "concat_padding_mask": True,
-            "extra_pos_embed_type": "learnable",
-        }
-        inputs_dict = self.dummy_input
-        return init_dict, inputs_dict
-
-    def test_gradient_checkpointing_is_applied(self):
-        expected_set = {"CosmosTransformer3DModel"}
-        super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
@@ -22,7 +22,7 @@ from diffusers.models.attention_processor import FluxIPAdapterJointAttnProcessor
 from diffusers.models.embeddings import ImageProjection
 from diffusers.utils.testing_utils import enable_full_determinism, torch_device

-from ..test_modeling_common import LoraHotSwappingForModelTesterMixin, ModelTesterMixin, TorchCompileTesterMixin
+from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin


 enable_full_determinism()
@@ -78,9 +78,7 @@ def create_flux_ip_adapter_state_dict(model):
    return ip_state_dict


-class FluxTransformerTests(
-    ModelTesterMixin, TorchCompileTesterMixin, LoraHotSwappingForModelTesterMixin, unittest.TestCase
-):
+class FluxTransformerTests(ModelTesterMixin, TorchCompileTesterMixin, unittest.TestCase):
    model_class = FluxTransformer2DModel
    main_input_name = "hidden_states"
    # We override the items here because the transformer under consideration is small.
@@ -53,7 +53,7 @@ from diffusers.utils.testing_utils import (
    torch_device,
 )

-from ..test_modeling_common import LoraHotSwappingForModelTesterMixin, ModelTesterMixin, UNetTesterMixin
+from ..test_modeling_common import ModelTesterMixin, UNetTesterMixin


 if is_peft_available():
@@ -350,9 +350,7 @@ def create_custom_diffusion_layers(model, mock_weights: bool = True):
    return custom_diffusion_attn_procs


-class UNet2DConditionModelTests(
-    ModelTesterMixin, LoraHotSwappingForModelTesterMixin, UNetTesterMixin, unittest.TestCase
-):
+class UNet2DConditionModelTests(ModelTesterMixin, UNetTesterMixin, unittest.TestCase):
    model_class = UNet2DConditionModel
    main_input_name = "sample"
    # We override the items here because the unet under consideration is small.
@@ -1,47 +0,0 @@
-# Copyright 2024 The HuggingFace Team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# ===== This file is an implementation of a dummy guardrail for the fast tests =====
-
-from typing import Union
-
-import numpy as np
-import torch
-
-from diffusers.configuration_utils import ConfigMixin
-from diffusers.models.modeling_utils import ModelMixin
-
-
-class DummyCosmosSafetyChecker(ModelMixin, ConfigMixin):
-    def __init__(self) -> None:
-        super().__init__()
-
-        self._dtype = torch.float32
-
-    def check_text_safety(self, prompt: str) -> bool:
-        return True
-
-    def check_video_safety(self, frames: np.ndarray) -> np.ndarray:
-        return frames
-
-    def to(self, device: Union[str, torch.device] = None, dtype: torch.dtype = None) -> None:
-        self._dtype = dtype
-
-    @property
-    def device(self) -> torch.device:
-        return None
-
-    @property
-    def dtype(self) -> torch.dtype:
-        return self._dtype
@@ -1,350 +0,0 @@
-# Copyright 2024 The HuggingFace Team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import json
-import os
-import tempfile
-import unittest
-
-import numpy as np
-import torch
-from transformers import AutoTokenizer, T5EncoderModel
-
-from diffusers import AutoencoderKLCosmos, CosmosTextToWorldPipeline, CosmosTransformer3DModel, EDMEulerScheduler
-from diffusers.utils.testing_utils import enable_full_determinism, torch_device
-
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin, to_np
-from .cosmos_guardrail import DummyCosmosSafetyChecker
-
-
-enable_full_determinism()
-
-
-class CosmosTextToWorldPipelineWrapper(CosmosTextToWorldPipeline):
-    @staticmethod
-    def from_pretrained(*args, **kwargs):
-        kwargs["safety_checker"] = DummyCosmosSafetyChecker()
-        return CosmosTextToWorldPipeline.from_pretrained(*args, **kwargs)
-
-
-class CosmosTextToWorldPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = CosmosTextToWorldPipelineWrapper
-    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
-    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS
-    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
-    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "generator",
-            "latents",
-            "return_dict",
-            "callback_on_step_end",
-            "callback_on_step_end_tensor_inputs",
-        ]
-    )
-    supports_dduf = False
-    test_xformers_attention = False
-    test_layerwise_casting = True
-    test_group_offloading = True
-
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        transformer = CosmosTransformer3DModel(
-            in_channels=4,
-            out_channels=4,
-            num_attention_heads=2,
-            attention_head_dim=16,
-            num_layers=2,
-            mlp_ratio=2,
-            text_embed_dim=32,
-            adaln_lora_dim=4,
-            max_size=(4, 32, 32),
-            patch_size=(1, 2, 2),
-            rope_scale=(2.0, 1.0, 1.0),
-            concat_padding_mask=True,
-            extra_pos_embed_type="learnable",
-        )
-
-        torch.manual_seed(0)
-        vae = AutoencoderKLCosmos(
-            in_channels=3,
-            out_channels=3,
-            latent_channels=4,
-            encoder_block_out_channels=(8, 8, 8, 8),
-            decode_block_out_channels=(8, 8, 8, 8),
-            attention_resolutions=(8,),
-            resolution=64,
-            num_layers=2,
-            patch_size=4,
-            patch_type="haar",
-            scaling_factor=1.0,
-            spatial_compression_ratio=4,
-            temporal_compression_ratio=4,
-        )
-
-        torch.manual_seed(0)
-        scheduler = EDMEulerScheduler(
-            sigma_min=0.002,
-            sigma_max=80,
-            sigma_data=0.5,
-            sigma_schedule="karras",
-            num_train_timesteps=1000,
-            prediction_type="epsilon",
-            rho=7.0,
-            final_sigmas_type="sigma_min",
-        )
-        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
-        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
-
-        components = {
-            "transformer": transformer,
-            "vae": vae,
-            "scheduler": scheduler,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            # We cannot run the Cosmos Guardrail for fast tests due to the large model size
-            "safety_checker": DummyCosmosSafetyChecker(),
-        }
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-
-        inputs = {
-            "prompt": "dance monkey",
-            "negative_prompt": "bad quality",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 3.0,
-            "height": 32,
-            "width": 32,
-            "num_frames": 9,
-            "max_sequence_length": 16,
-            "output_type": "pt",
-        }
-
-        return inputs
-
-    def test_inference(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        video = pipe(**inputs).frames
-        generated_video = video[0]
-
-        self.assertEqual(generated_video.shape, (9, 3, 32, 32))
-        expected_video = torch.randn(9, 3, 32, 32)
-        max_diff = np.abs(generated_video - expected_video).max()
-        self.assertLessEqual(max_diff, 1e10)
-
-    def test_callback_inputs(self):
-        sig = inspect.signature(self.pipeline_class.__call__)
-        has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
-        has_callback_step_end = "callback_on_step_end" in sig.parameters
-
-        if not (has_callback_tensor_inputs and has_callback_step_end):
-            return
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        self.assertTrue(
-            hasattr(pipe, "_callback_tensor_inputs"),
-            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
-        )
-
-        def callback_inputs_subset(pipe, i, t, callback_kwargs):
-            # iterate over callback args
-            for tensor_name, tensor_value in callback_kwargs.items():
-                # check that we're only passing in allowed tensor inputs
-                assert tensor_name in pipe._callback_tensor_inputs
-
-            return callback_kwargs
-
-        def callback_inputs_all(pipe, i, t, callback_kwargs):
-            for tensor_name in pipe._callback_tensor_inputs:
-                assert tensor_name in callback_kwargs
-
-            # iterate over callback args
-            for tensor_name, tensor_value in callback_kwargs.items():
-                # check that we're only passing in allowed tensor inputs
-                assert tensor_name in pipe._callback_tensor_inputs
-
-            return callback_kwargs
-
-        inputs = self.get_dummy_inputs(torch_device)
-
-        # Test passing in a subset
-        inputs["callback_on_step_end"] = callback_inputs_subset
-        inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
-        output = pipe(**inputs)[0]
-
-        # Test passing in a everything
-        inputs["callback_on_step_end"] = callback_inputs_all
-        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
-        output = pipe(**inputs)[0]
-
-        def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
-            is_last = i == (pipe.num_timesteps - 1)
-            if is_last:
-                callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
-            return callback_kwargs
-
-        inputs["callback_on_step_end"] = callback_inputs_change_tensor
-        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
-        output = pipe(**inputs)[0]
-        assert output.abs().sum() < 1e10
-
-    def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-2)
-
-    def test_attention_slicing_forward_pass(
-        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
-    ):
-        if not self.test_attention_slicing:
-            return
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        for component in pipe.components.values():
-            if hasattr(component, "set_default_attn_processor"):
-                component.set_default_attn_processor()
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        generator_device = "cpu"
-        inputs = self.get_dummy_inputs(generator_device)
-        output_without_slicing = pipe(**inputs)[0]
-
-        pipe.enable_attention_slicing(slice_size=1)
-        inputs = self.get_dummy_inputs(generator_device)
-        output_with_slicing1 = pipe(**inputs)[0]
-
-        pipe.enable_attention_slicing(slice_size=2)
-        inputs = self.get_dummy_inputs(generator_device)
-        output_with_slicing2 = pipe(**inputs)[0]
-
-        if test_max_difference:
-            max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
-            max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
-            self.assertLess(
-                max(max_diff1, max_diff2),
-                expected_max_diff,
-                "Attention slicing should not affect the inference results",
-            )
-
-    def test_vae_tiling(self, expected_diff_max: float = 0.2):
-        generator_device = "cpu"
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components)
-        pipe.to("cpu")
-        pipe.set_progress_bar_config(disable=None)
-
-        # Without tiling
-        inputs = self.get_dummy_inputs(generator_device)
-        inputs["height"] = inputs["width"] = 128
-        output_without_tiling = pipe(**inputs)[0]
-
-        # With tiling
-        pipe.vae.enable_tiling(
-            tile_sample_min_height=96,
-            tile_sample_min_width=96,
-            tile_sample_stride_height=64,
-            tile_sample_stride_width=64,
-        )
-        inputs = self.get_dummy_inputs(generator_device)
-        inputs["height"] = inputs["width"] = 128
-        output_with_tiling = pipe(**inputs)[0]
-
-        self.assertLess(
-            (to_np(output_without_tiling) - to_np(output_with_tiling)).max(),
-            expected_diff_max,
-            "VAE tiling should not affect the inference results",
-        )
-
-    def test_save_load_optional_components(self, expected_max_difference=1e-4):
-        self.pipeline_class._optional_components.remove("safety_checker")
-        super().test_save_load_optional_components(expected_max_difference=expected_max_difference)
-        self.pipeline_class._optional_components.append("safety_checker")
-
-    def test_serialization_with_variants(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        model_components = [
-            component_name
-            for component_name, component in pipe.components.items()
-            if isinstance(component, torch.nn.Module)
-        ]
-        model_components.remove("safety_checker")
-        variant = "fp16"
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            pipe.save_pretrained(tmpdir, variant=variant, safe_serialization=False)
-
-            with open(f"{tmpdir}/model_index.json", "r") as f:
-                config = json.load(f)
-
-            for subfolder in os.listdir(tmpdir):
-                if not os.path.isfile(subfolder) and subfolder in model_components:
-                    folder_path = os.path.join(tmpdir, subfolder)
-                    is_folder = os.path.isdir(folder_path) and subfolder in config
-                    assert is_folder and any(p.split(".")[1].startswith(variant) for p in os.listdir(folder_path))
-
-    def test_torch_dtype_dict(self):
-        components = self.get_dummy_components()
-        if not components:
-            self.skipTest("No dummy components defined.")
-
-        pipe = self.pipeline_class(**components)
-
-        specified_key = next(iter(components.keys()))
-
-        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdirname:
-            pipe.save_pretrained(tmpdirname, safe_serialization=False)
-            torch_dtype_dict = {specified_key: torch.bfloat16, "default": torch.float16}
-            loaded_pipe = self.pipeline_class.from_pretrained(
-                tmpdirname, safety_checker=DummyCosmosSafetyChecker(), torch_dtype=torch_dtype_dict
-            )
-
-        for name, component in loaded_pipe.components.items():
-            if name == "safety_checker":
-                continue
-            if isinstance(component, torch.nn.Module) and hasattr(component, "dtype"):
-                expected_dtype = torch_dtype_dict.get(name, torch_dtype_dict.get("default", torch.float32))
-                self.assertEqual(
-                    component.dtype,
-                    expected_dtype,
-                    f"Component '{name}' has dtype {component.dtype} but expected {expected_dtype}",
-                )
-
-    @unittest.skip(
-        "The pipeline should not be runnable without a safety checker. The test creates a pipeline without passing in "
-        "a safety checker, which makes the pipeline default to the actual Cosmos Guardrail. The Cosmos Guardrail is "
-        "too large and slow to run on CI."
-    )
-    def test_encode_prompt_works_in_isolation(self):
-        pass
@@ -1,363 +0,0 @@
-# Copyright 2024 The HuggingFace Team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import json
-import os
-import tempfile
-import unittest
-
-import numpy as np
-import PIL.Image
-import torch
-from transformers import AutoTokenizer, T5EncoderModel
-
-from diffusers import AutoencoderKLCosmos, CosmosTransformer3DModel, CosmosVideoToWorldPipeline, EDMEulerScheduler
-from diffusers.utils.testing_utils import enable_full_determinism, torch_device
-
-from ..pipeline_params import TEXT_TO_IMAGE_BATCH_PARAMS, TEXT_TO_IMAGE_IMAGE_PARAMS, TEXT_TO_IMAGE_PARAMS
-from ..test_pipelines_common import PipelineTesterMixin, to_np
-from .cosmos_guardrail import DummyCosmosSafetyChecker
-
-
-enable_full_determinism()
-
-
-class CosmosVideoToWorldPipelineWrapper(CosmosVideoToWorldPipeline):
-    @staticmethod
-    def from_pretrained(*args, **kwargs):
-        kwargs["safety_checker"] = DummyCosmosSafetyChecker()
-        return CosmosVideoToWorldPipeline.from_pretrained(*args, **kwargs)
-
-
-class CosmosVideoToWorldPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
-    pipeline_class = CosmosVideoToWorldPipelineWrapper
-    params = TEXT_TO_IMAGE_PARAMS - {"cross_attention_kwargs"}
-    batch_params = TEXT_TO_IMAGE_BATCH_PARAMS.union({"image", "video"})
-    image_params = TEXT_TO_IMAGE_IMAGE_PARAMS
-    image_latents_params = TEXT_TO_IMAGE_IMAGE_PARAMS
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "generator",
-            "latents",
-            "return_dict",
-            "callback_on_step_end",
-            "callback_on_step_end_tensor_inputs",
-        ]
-    )
-    supports_dduf = False
-    test_xformers_attention = False
-    test_layerwise_casting = True
-    test_group_offloading = True
-
-    def get_dummy_components(self):
-        torch.manual_seed(0)
-        transformer = CosmosTransformer3DModel(
-            in_channels=4 + 1,
-            out_channels=4,
-            num_attention_heads=2,
-            attention_head_dim=16,
-            num_layers=2,
-            mlp_ratio=2,
-            text_embed_dim=32,
-            adaln_lora_dim=4,
-            max_size=(4, 32, 32),
-            patch_size=(1, 2, 2),
-            rope_scale=(2.0, 1.0, 1.0),
-            concat_padding_mask=True,
-            extra_pos_embed_type="learnable",
-        )
-
-        torch.manual_seed(0)
-        vae = AutoencoderKLCosmos(
-            in_channels=3,
-            out_channels=3,
-            latent_channels=4,
-            encoder_block_out_channels=(8, 8, 8, 8),
-            decode_block_out_channels=(8, 8, 8, 8),
-            attention_resolutions=(8,),
-            resolution=64,
-            num_layers=2,
-            patch_size=4,
-            patch_type="haar",
-            scaling_factor=1.0,
-            spatial_compression_ratio=4,
-            temporal_compression_ratio=4,
-        )
-
-        torch.manual_seed(0)
-        scheduler = EDMEulerScheduler(
-            sigma_min=0.002,
-            sigma_max=80,
-            sigma_data=0.5,
-            sigma_schedule="karras",
-            num_train_timesteps=1000,
-            prediction_type="epsilon",
-            rho=7.0,
-            final_sigmas_type="sigma_min",
-        )
-        text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5")
-        tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5")
-
-        components = {
-            "transformer": transformer,
-            "vae": vae,
-            "scheduler": scheduler,
-            "text_encoder": text_encoder,
-            "tokenizer": tokenizer,
-            # We cannot run the Cosmos Guardrail for fast tests due to the large model size
-            "safety_checker": DummyCosmosSafetyChecker(),
-        }
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-
-        image_height = 32
-        image_width = 32
-        image = PIL.Image.new("RGB", (image_width, image_height))
-
-        inputs = {
-            "image": image,
-            "prompt": "dance monkey",
-            "negative_prompt": "bad quality",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 3.0,
-            "height": image_height,
-            "width": image_width,
-            "num_frames": 9,
-            "max_sequence_length": 16,
-            "output_type": "pt",
-        }
-
-        return inputs
-
-    def test_inference(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        video = pipe(**inputs).frames
-        generated_video = video[0]
-
-        self.assertEqual(generated_video.shape, (9, 3, 32, 32))
-        expected_video = torch.randn(9, 3, 32, 32)
-        max_diff = np.abs(generated_video - expected_video).max()
-        self.assertLessEqual(max_diff, 1e10)
-
-    def test_components_function(self):
-        init_components = self.get_dummy_components()
-        init_components = {k: v for k, v in init_components.items() if not isinstance(v, (str, int, float))}
-        pipe = self.pipeline_class(**init_components)
-        self.assertTrue(hasattr(pipe, "components"))
-        self.assertTrue(set(pipe.components.keys()) == set(init_components.keys()))
-
-    def test_callback_inputs(self):
-        sig = inspect.signature(self.pipeline_class.__call__)
-        has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
-        has_callback_step_end = "callback_on_step_end" in sig.parameters
-
-        if not (has_callback_tensor_inputs and has_callback_step_end):
-            return
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        self.assertTrue(
-            hasattr(pipe, "_callback_tensor_inputs"),
-            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
-        )
-
-        def callback_inputs_subset(pipe, i, t, callback_kwargs):
-            # iterate over callback args
-            for tensor_name, tensor_value in callback_kwargs.items():
-                # check that we're only passing in allowed tensor inputs
-                assert tensor_name in pipe._callback_tensor_inputs
-
-            return callback_kwargs
-
-        def callback_inputs_all(pipe, i, t, callback_kwargs):
-            for tensor_name in pipe._callback_tensor_inputs:
-                assert tensor_name in callback_kwargs
-
-            # iterate over callback args
-            for tensor_name, tensor_value in callback_kwargs.items():
-                # check that we're only passing in allowed tensor inputs
-                assert tensor_name in pipe._callback_tensor_inputs
-
-            return callback_kwargs
-
-        inputs = self.get_dummy_inputs(torch_device)
-
-        # Test passing in a subset
-        inputs["callback_on_step_end"] = callback_inputs_subset
-        inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
-        output = pipe(**inputs)[0]
-
-        # Test passing in a everything
-        inputs["callback_on_step_end"] = callback_inputs_all
-        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
-        output = pipe(**inputs)[0]
-
-        def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
-            is_last = i == (pipe.num_timesteps - 1)
-            if is_last:
-                callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
-            return callback_kwargs
-
-        inputs["callback_on_step_end"] = callback_inputs_change_tensor
-        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
-        output = pipe(**inputs)[0]
-        assert output.abs().sum() < 1e10
-
-    def test_inference_batch_single_identical(self):
-        self._test_inference_batch_single_identical(batch_size=3, expected_max_diff=1e-2)
-
-    def test_attention_slicing_forward_pass(
-        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
-    ):
-        if not self.test_attention_slicing:
-            return
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        for component in pipe.components.values():
-            if hasattr(component, "set_default_attn_processor"):
-                component.set_default_attn_processor()
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        generator_device = "cpu"
-        inputs = self.get_dummy_inputs(generator_device)
-        output_without_slicing = pipe(**inputs)[0]
-
-        pipe.enable_attention_slicing(slice_size=1)
-        inputs = self.get_dummy_inputs(generator_device)
-        output_with_slicing1 = pipe(**inputs)[0]
-
-        pipe.enable_attention_slicing(slice_size=2)
-        inputs = self.get_dummy_inputs(generator_device)
-        output_with_slicing2 = pipe(**inputs)[0]
-
-        if test_max_difference:
-            max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
-            max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
-            self.assertLess(
-                max(max_diff1, max_diff2),
-                expected_max_diff,
-                "Attention slicing should not affect the inference results",
-            )
-
-    def test_vae_tiling(self, expected_diff_max: float = 0.2):
-        generator_device = "cpu"
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components)
-        pipe.to("cpu")
-        pipe.set_progress_bar_config(disable=None)
-
-        # Without tiling
-        inputs = self.get_dummy_inputs(generator_device)
-        inputs["height"] = inputs["width"] = 128
-        output_without_tiling = pipe(**inputs)[0]
-
-        # With tiling
-        pipe.vae.enable_tiling(
-            tile_sample_min_height=96,
-            tile_sample_min_width=96,
-            tile_sample_stride_height=64,
-            tile_sample_stride_width=64,
-        )
-        inputs = self.get_dummy_inputs(generator_device)
-        inputs["height"] = inputs["width"] = 128
-        output_with_tiling = pipe(**inputs)[0]
-
-        self.assertLess(
-            (to_np(output_without_tiling) - to_np(output_with_tiling)).max(),
-            expected_diff_max,
-            "VAE tiling should not affect the inference results",
-        )
-
-    def test_save_load_optional_components(self, expected_max_difference=1e-4):
-        self.pipeline_class._optional_components.remove("safety_checker")
-        super().test_save_load_optional_components(expected_max_difference=expected_max_difference)
-        self.pipeline_class._optional_components.append("safety_checker")
-
-    def test_serialization_with_variants(self):
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        model_components = [
-            component_name
-            for component_name, component in pipe.components.items()
-            if isinstance(component, torch.nn.Module)
-        ]
-        model_components.remove("safety_checker")
-        variant = "fp16"
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            pipe.save_pretrained(tmpdir, variant=variant, safe_serialization=False)
-
-            with open(f"{tmpdir}/model_index.json", "r") as f:
-                config = json.load(f)
-
-            for subfolder in os.listdir(tmpdir):
-                if not os.path.isfile(subfolder) and subfolder in model_components:
-                    folder_path = os.path.join(tmpdir, subfolder)
-                    is_folder = os.path.isdir(folder_path) and subfolder in config
-                    assert is_folder and any(p.split(".")[1].startswith(variant) for p in os.listdir(folder_path))
-
-    def test_torch_dtype_dict(self):
-        components = self.get_dummy_components()
-        if not components:
-            self.skipTest("No dummy components defined.")
-
-        pipe = self.pipeline_class(**components)
-
-        specified_key = next(iter(components.keys()))
-
-        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdirname:
-            pipe.save_pretrained(tmpdirname, safe_serialization=False)
-            torch_dtype_dict = {specified_key: torch.bfloat16, "default": torch.float16}
-            loaded_pipe = self.pipeline_class.from_pretrained(
-                tmpdirname, safety_checker=DummyCosmosSafetyChecker(), torch_dtype=torch_dtype_dict
-            )
-
-        for name, component in loaded_pipe.components.items():
-            if name == "safety_checker":
-                continue
-            if isinstance(component, torch.nn.Module) and hasattr(component, "dtype"):
-                expected_dtype = torch_dtype_dict.get(name, torch_dtype_dict.get("default", torch.float32))
-                self.assertEqual(
-                    component.dtype,
-                    expected_dtype,
-                    f"Component '{name}' has dtype {component.dtype} but expected {expected_dtype}",
-                )
-
-    @unittest.skip(
-        "The pipeline should not be runnable without a safety checker. The test creates a pipeline without passing in "
-        "a safety checker, which makes the pipeline default to the actual Cosmos Guardrail. The Cosmos Guardrail is "
-        "too large and slow to run on CI."
-    )
-    def test_encode_prompt_works_in_isolation(self):
-        pass
@@ -1,374 +0,0 @@
-# Copyright 2025 The HuggingFace Team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import inspect
-import unittest
-
-import numpy as np
-import torch
-from PIL import Image
-from transformers import (
-    CLIPTextConfig,
-    CLIPTextModel,
-    CLIPTokenizer,
-    LlamaConfig,
-    LlamaModel,
-    LlamaTokenizer,
-    SiglipImageProcessor,
-    SiglipVisionModel,
-)
-
-from diffusers import (
-    AutoencoderKLHunyuanVideo,
-    FasterCacheConfig,
-    FlowMatchEulerDiscreteScheduler,
-    HunyuanVideoFramepackPipeline,
-    HunyuanVideoFramepackTransformer3DModel,
-)
-from diffusers.utils.testing_utils import (
-    enable_full_determinism,
-    torch_device,
-)
-
-from ..test_pipelines_common import (
-    FasterCacheTesterMixin,
-    PipelineTesterMixin,
-    PyramidAttentionBroadcastTesterMixin,
-    to_np,
-)
-
-
-enable_full_determinism()
-
-
-class HunyuanVideoFramepackPipelineFastTests(
-    PipelineTesterMixin, PyramidAttentionBroadcastTesterMixin, FasterCacheTesterMixin, unittest.TestCase
-):
-    pipeline_class = HunyuanVideoFramepackPipeline
-    params = frozenset(
-        ["image", "prompt", "height", "width", "guidance_scale", "prompt_embeds", "pooled_prompt_embeds"]
-    )
-    batch_params = frozenset(["image", "prompt"])
-    required_optional_params = frozenset(
-        [
-            "num_inference_steps",
-            "generator",
-            "return_dict",
-            "callback_on_step_end",
-            "callback_on_step_end_tensor_inputs",
-        ]
-    )
-
-    supports_dduf = False
-    # there is no xformers processor for Flux
-    test_xformers_attention = False
-    test_layerwise_casting = True
-    test_group_offloading = True
-
-    faster_cache_config = FasterCacheConfig(
-        spatial_attention_block_skip_range=2,
-        spatial_attention_timestep_skip_range=(-1, 901),
-        unconditional_batch_skip_range=2,
-        attention_weight_callback=lambda _: 0.5,
-        is_guidance_distilled=True,
-    )
-
-    def get_dummy_components(self, num_layers: int = 1, num_single_layers: int = 1):
-        torch.manual_seed(0)
-        transformer = HunyuanVideoFramepackTransformer3DModel(
-            in_channels=4,
-            out_channels=4,
-            num_attention_heads=2,
-            attention_head_dim=10,
-            num_layers=num_layers,
-            num_single_layers=num_single_layers,
-            num_refiner_layers=1,
-            patch_size=2,
-            patch_size_t=1,
-            guidance_embeds=True,
-            text_embed_dim=16,
-            pooled_projection_dim=8,
-            rope_axes_dim=(2, 4, 4),
-            image_condition_type=None,
-            has_image_proj=True,
-            image_proj_dim=32,
-            has_clean_x_embedder=True,
-        )
-
-        torch.manual_seed(0)
-        vae = AutoencoderKLHunyuanVideo(
-            in_channels=3,
-            out_channels=3,
-            latent_channels=4,
-            down_block_types=(
-                "HunyuanVideoDownBlock3D",
-                "HunyuanVideoDownBlock3D",
-                "HunyuanVideoDownBlock3D",
-                "HunyuanVideoDownBlock3D",
-            ),
-            up_block_types=(
-                "HunyuanVideoUpBlock3D",
-                "HunyuanVideoUpBlock3D",
-                "HunyuanVideoUpBlock3D",
-                "HunyuanVideoUpBlock3D",
-            ),
-            block_out_channels=(8, 8, 8, 8),
-            layers_per_block=1,
-            act_fn="silu",
-            norm_num_groups=4,
-            scaling_factor=0.476986,
-            spatial_compression_ratio=8,
-            temporal_compression_ratio=4,
-            mid_block_add_attention=True,
-        )
-
-        torch.manual_seed(0)
-        scheduler = FlowMatchEulerDiscreteScheduler(shift=7.0)
-
-        llama_text_encoder_config = LlamaConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=16,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=2,
-            pad_token_id=1,
-            vocab_size=1000,
-            hidden_act="gelu",
-            projection_dim=32,
-        )
-        clip_text_encoder_config = CLIPTextConfig(
-            bos_token_id=0,
-            eos_token_id=2,
-            hidden_size=8,
-            intermediate_size=37,
-            layer_norm_eps=1e-05,
-            num_attention_heads=4,
-            num_hidden_layers=2,
-            pad_token_id=1,
-            vocab_size=1000,
-            hidden_act="gelu",
-            projection_dim=32,
-        )
-
-        torch.manual_seed(0)
-        text_encoder = LlamaModel(llama_text_encoder_config)
-        tokenizer = LlamaTokenizer.from_pretrained("finetrainers/dummy-hunyaunvideo", subfolder="tokenizer")
-
-        torch.manual_seed(0)
-        text_encoder_2 = CLIPTextModel(clip_text_encoder_config)
-        tokenizer_2 = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
-
-        feature_extractor = SiglipImageProcessor.from_pretrained(
-            "hf-internal-testing/tiny-random-SiglipVisionModel", size={"height": 30, "width": 30}
-        )
-        image_encoder = SiglipVisionModel.from_pretrained("hf-internal-testing/tiny-random-SiglipVisionModel")
-
-        components = {
-            "transformer": transformer,
-            "vae": vae,
-            "scheduler": scheduler,
-            "text_encoder": text_encoder,
-            "text_encoder_2": text_encoder_2,
-            "tokenizer": tokenizer,
-            "tokenizer_2": tokenizer_2,
-            "feature_extractor": feature_extractor,
-            "image_encoder": image_encoder,
-        }
-        return components
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-
-        image_height = 32
-        image_width = 32
-        image = Image.new("RGB", (image_width, image_height))
-        inputs = {
-            "image": image,
-            "prompt": "dance monkey",
-            "prompt_template": {
-                "template": "{}",
-                "crop_start": 0,
-            },
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 4.5,
-            "height": image_height,
-            "width": image_width,
-            "num_frames": 9,
-            "latent_window_size": 3,
-            "max_sequence_length": 256,
-            "output_type": "pt",
-        }
-        return inputs
-
-    def test_inference(self):
-        device = "cpu"
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe.to(device)
-        pipe.set_progress_bar_config(disable=None)
-
-        inputs = self.get_dummy_inputs(device)
-        video = pipe(**inputs).frames
-        generated_video = video[0]
-
-        self.assertEqual(generated_video.shape, (13, 3, 32, 32))
-        expected_video = torch.randn(13, 3, 32, 32)
-        max_diff = np.abs(generated_video - expected_video).max()
-        self.assertLessEqual(max_diff, 1e10)
-
-    def test_callback_inputs(self):
-        sig = inspect.signature(self.pipeline_class.__call__)
-        has_callback_tensor_inputs = "callback_on_step_end_tensor_inputs" in sig.parameters
-        has_callback_step_end = "callback_on_step_end" in sig.parameters
-
-        if not (has_callback_tensor_inputs and has_callback_step_end):
-            return
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        pipe = pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-        self.assertTrue(
-            hasattr(pipe, "_callback_tensor_inputs"),
-            f" {self.pipeline_class} should have `_callback_tensor_inputs` that defines a list of tensor variables its callback function can use as inputs",
-        )
-
-        def callback_inputs_subset(pipe, i, t, callback_kwargs):
-            # iterate over callback args
-            for tensor_name, tensor_value in callback_kwargs.items():
-                # check that we're only passing in allowed tensor inputs
-                assert tensor_name in pipe._callback_tensor_inputs
-
-            return callback_kwargs
-
-        def callback_inputs_all(pipe, i, t, callback_kwargs):
-            for tensor_name in pipe._callback_tensor_inputs:
-                assert tensor_name in callback_kwargs
-
-            # iterate over callback args
-            for tensor_name, tensor_value in callback_kwargs.items():
-                # check that we're only passing in allowed tensor inputs
-                assert tensor_name in pipe._callback_tensor_inputs
-
-            return callback_kwargs
-
-        inputs = self.get_dummy_inputs(torch_device)
-
-        # Test passing in a subset
-        inputs["callback_on_step_end"] = callback_inputs_subset
-        inputs["callback_on_step_end_tensor_inputs"] = ["latents"]
-        output = pipe(**inputs)[0]
-
-        # Test passing in a everything
-        inputs["callback_on_step_end"] = callback_inputs_all
-        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
-        output = pipe(**inputs)[0]
-
-        def callback_inputs_change_tensor(pipe, i, t, callback_kwargs):
-            is_last = i == (pipe.num_timesteps - 1)
-            if is_last:
-                callback_kwargs["latents"] = torch.zeros_like(callback_kwargs["latents"])
-            return callback_kwargs
-
-        inputs["callback_on_step_end"] = callback_inputs_change_tensor
-        inputs["callback_on_step_end_tensor_inputs"] = pipe._callback_tensor_inputs
-        output = pipe(**inputs)[0]
-        assert output.abs().sum() < 1e10
-
-    def test_attention_slicing_forward_pass(
-        self, test_max_difference=True, test_mean_pixel_difference=True, expected_max_diff=1e-3
-    ):
-        if not self.test_attention_slicing:
-            return
-
-        components = self.get_dummy_components()
-        pipe = self.pipeline_class(**components)
-        for component in pipe.components.values():
-            if hasattr(component, "set_default_attn_processor"):
-                component.set_default_attn_processor()
-        pipe.to(torch_device)
-        pipe.set_progress_bar_config(disable=None)
-
-        generator_device = "cpu"
-        inputs = self.get_dummy_inputs(generator_device)
-        output_without_slicing = pipe(**inputs)[0]
-
-        pipe.enable_attention_slicing(slice_size=1)
-        inputs = self.get_dummy_inputs(generator_device)
-        output_with_slicing1 = pipe(**inputs)[0]
-
-        pipe.enable_attention_slicing(slice_size=2)
-        inputs = self.get_dummy_inputs(generator_device)
-        output_with_slicing2 = pipe(**inputs)[0]
-
-        if test_max_difference:
-            max_diff1 = np.abs(to_np(output_with_slicing1) - to_np(output_without_slicing)).max()
-            max_diff2 = np.abs(to_np(output_with_slicing2) - to_np(output_without_slicing)).max()
-            self.assertLess(
-                max(max_diff1, max_diff2),
-                expected_max_diff,
-                "Attention slicing should not affect the inference results",
-            )
-
-    def test_vae_tiling(self, expected_diff_max: float = 0.2):
-        # Seems to require higher tolerance than the other tests
-        expected_diff_max = 0.6
-        generator_device = "cpu"
-        components = self.get_dummy_components()
-
-        pipe = self.pipeline_class(**components)
-        pipe.to("cpu")
-        pipe.set_progress_bar_config(disable=None)
-
-        # Without tiling
-        inputs = self.get_dummy_inputs(generator_device)
-        inputs["height"] = inputs["width"] = 128
-        output_without_tiling = pipe(**inputs)[0]
-
-        # With tiling
-        pipe.vae.enable_tiling(
-            tile_sample_min_height=96,
-            tile_sample_min_width=96,
-            tile_sample_stride_height=64,
-            tile_sample_stride_width=64,
-        )
-        inputs = self.get_dummy_inputs(generator_device)
-        inputs["height"] = inputs["width"] = 128
-        output_with_tiling = pipe(**inputs)[0]
-
-        self.assertLess(
-            (to_np(output_without_tiling) - to_np(output_with_tiling)).max(),
-            expected_diff_max,
-            "VAE tiling should not affect the inference results",
-        )
-
-    # TODO(aryan): Create a dummy gemma model with smol vocab size
-    @unittest.skip(
-        "A very small vocab size is used for fast tests. So, any kind of prompt other than the empty default used in other tests will lead to a embedding lookup error. This test uses a long prompt that causes the error."
-    )
-    def test_inference_batch_consistent(self):
-        pass
-
-    @unittest.skip(
-        "A very small vocab size is used for fast tests. So, any kind of prompt other than the empty default used in other tests will lead to a embedding lookup error. This test uses a long prompt that causes the error."
-    )
-    def test_inference_batch_single_identical(self):
-        pass
@@ -2291,6 +2291,7 @@ class PipelineTesterMixin:
            self.skipTest("No dummy components defined.")

        pipe = self.pipeline_class(**components)
+
        specified_key = next(iter(components.keys()))

        with tempfile.TemporaryDirectory(ignore_cleanup_errors=True) as tmpdirname:
@@ -1,190 +0,0 @@
-# coding=utf-8
-# Copyright 2024 The HuggingFace Team Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import tempfile
-import unittest
-
-import torch
-
-from diffusers import DiffusionPipeline, QuantoConfig
-from diffusers.quantizers import PipelineQuantizationConfig
-from diffusers.utils.testing_utils import (
-    is_transformers_available,
-    require_accelerate,
-    require_bitsandbytes_version_greater,
-    require_quanto,
-    require_torch,
-    require_torch_accelerator,
-    slow,
-    torch_device,
-)
-
-
-if is_transformers_available():
-    from transformers import BitsAndBytesConfig as TranBitsAndBytesConfig
-else:
-    TranBitsAndBytesConfig = None
-
-
-@require_bitsandbytes_version_greater("0.43.2")
-@require_quanto
-@require_accelerate
-@require_torch
-@require_torch_accelerator
-@slow
-class PipelineQuantizationTests(unittest.TestCase):
-    model_name = "hf-internal-testing/tiny-flux-pipe"
-    prompt = "a beautiful sunset amidst the mountains."
-    num_inference_steps = 10
-    seed = 0
-
-    def test_quant_config_set_correctly_through_kwargs(self):
-        components_to_quantize = ["transformer", "text_encoder_2"]
-        quant_config = PipelineQuantizationConfig(
-            quant_backend="bitsandbytes_4bit",
-            quant_kwargs={
-                "load_in_4bit": True,
-                "bnb_4bit_quant_type": "nf4",
-                "bnb_4bit_compute_dtype": torch.bfloat16,
-            },
-            components_to_quantize=components_to_quantize,
-        )
-        pipe = DiffusionPipeline.from_pretrained(
-            self.model_name,
-            quantization_config=quant_config,
-            torch_dtype=torch.bfloat16,
-        ).to(torch_device)
-        for name, component in pipe.components.items():
-            if name in components_to_quantize:
-                self.assertTrue(getattr(component.config, "quantization_config", None) is not None)
-                quantization_config = component.config.quantization_config
-                self.assertTrue(quantization_config.load_in_4bit)
-                self.assertTrue(quantization_config.quant_method == "bitsandbytes")
-
-        _ = pipe(self.prompt, num_inference_steps=self.num_inference_steps)
-
-    def test_quant_config_set_correctly_through_granular(self):
-        quant_config = PipelineQuantizationConfig(
-            quant_mapping={
-                "transformer": QuantoConfig(weights_dtype="int8"),
-                "text_encoder_2": TranBitsAndBytesConfig(load_in_4bit=True, compute_dtype=torch.bfloat16),
-            }
-        )
-        components_to_quantize = list(quant_config.quant_mapping.keys())
-        pipe = DiffusionPipeline.from_pretrained(
-            self.model_name,
-            quantization_config=quant_config,
-            torch_dtype=torch.bfloat16,
-        ).to(torch_device)
-        for name, component in pipe.components.items():
-            if name in components_to_quantize:
-                self.assertTrue(getattr(component.config, "quantization_config", None) is not None)
-                quantization_config = component.config.quantization_config
-
-                if name == "text_encoder_2":
-                    self.assertTrue(quantization_config.load_in_4bit)
-                    self.assertTrue(quantization_config.quant_method == "bitsandbytes")
-                else:
-                    self.assertTrue(quantization_config.quant_method == "quanto")
-
-        _ = pipe(self.prompt, num_inference_steps=self.num_inference_steps)
-
-    def test_raises_error_for_invalid_config(self):
-        with self.assertRaises(ValueError) as err_context:
-            _ = PipelineQuantizationConfig(
-                quant_mapping={
-                    "transformer": QuantoConfig(weights_dtype="int8"),
-                    "text_encoder_2": TranBitsAndBytesConfig(load_in_4bit=True, compute_dtype=torch.bfloat16),
-                },
-                quant_backend="bitsandbytes_4bit",
-            )
-
-        self.assertTrue(
-            str(err_context.exception)
-            == "Both `quant_backend` and `quant_mapping` cannot be specified at the same time."
-        )
-
-    def test_validation_for_kwargs(self):
-        components_to_quantize = ["transformer", "text_encoder_2"]
-        with self.assertRaises(ValueError) as err_context:
-            _ = PipelineQuantizationConfig(
-                quant_backend="quanto",
-                quant_kwargs={"weights_dtype": "int8"},
-                components_to_quantize=components_to_quantize,
-            )
-
-        self.assertTrue(
-            "The signatures of the __init__ methods of the quantization config classes" in str(err_context.exception)
-        )
-
-    def test_raises_error_for_wrong_config_class(self):
-        quant_config = {
-            "transformer": QuantoConfig(weights_dtype="int8"),
-            "text_encoder_2": TranBitsAndBytesConfig(load_in_4bit=True, compute_dtype=torch.bfloat16),
-        }
-        with self.assertRaises(ValueError) as err_context:
-            _ = DiffusionPipeline.from_pretrained(
-                self.model_name,
-                quantization_config=quant_config,
-                torch_dtype=torch.bfloat16,
-            )
-        self.assertTrue(
-            str(err_context.exception) == "`quantization_config` must be an instance of `PipelineQuantizationConfig`."
-        )
-
-    def test_validation_for_mapping(self):
-        with self.assertRaises(ValueError) as err_context:
-            _ = PipelineQuantizationConfig(
-                quant_mapping={
-                    "transformer": DiffusionPipeline(),
-                    "text_encoder_2": TranBitsAndBytesConfig(load_in_4bit=True, compute_dtype=torch.bfloat16),
-                }
-            )
-
-        self.assertTrue("Provided config for module_name=transformer could not be found" in str(err_context.exception))
-
-    def test_saving_loading(self):
-        quant_config = PipelineQuantizationConfig(
-            quant_mapping={
-                "transformer": QuantoConfig(weights_dtype="int8"),
-                "text_encoder_2": TranBitsAndBytesConfig(load_in_4bit=True, compute_dtype=torch.bfloat16),
-            }
-        )
-        components_to_quantize = list(quant_config.quant_mapping.keys())
-        pipe = DiffusionPipeline.from_pretrained(
-            self.model_name,
-            quantization_config=quant_config,
-            torch_dtype=torch.bfloat16,
-        ).to(torch_device)
-
-        pipe_inputs = {"prompt": self.prompt, "num_inference_steps": self.num_inference_steps, "output_type": "latent"}
-        output_1 = pipe(**pipe_inputs, generator=torch.manual_seed(self.seed)).images
-
-        with tempfile.TemporaryDirectory() as tmpdir:
-            pipe.save_pretrained(tmpdir)
-            loaded_pipe = DiffusionPipeline.from_pretrained(tmpdir, torch_dtype=torch.bfloat16).to(torch_device)
-        for name, component in loaded_pipe.components.items():
-            if name in components_to_quantize:
-                self.assertTrue(getattr(component.config, "quantization_config", None) is not None)
-                quantization_config = component.config.quantization_config
-
-                if name == "text_encoder_2":
-                    self.assertTrue(quantization_config.load_in_4bit)
-                    self.assertTrue(quantization_config.quant_method == "bitsandbytes")
-                else:
-                    self.assertTrue(quantization_config.quant_method == "quanto")
-
-        output_2 = loaded_pipe(**pipe_inputs, generator=torch.manual_seed(self.seed)).images
-
-        self.assertTrue(torch.allclose(output_1, output_2))
Author	SHA1	Message	Date
sayakpaul	9862c93d6d	fix	2025-05-06 10:59:19 +05:30
sayakpaul	71ae84da4a	update dep table.	2025-05-06 10:57:38 +05:30