update

Merge branch 'main' into ruff-update
update
2025-04-09 15:34:42 +05:30 · 2025-04-09 15:22:55 +05:30 · 2025-03-08 08:17:14 +05:30 · 2025-03-08 08:07:10 +05:30 · 2025-03-08 08:05:08 +05:30 · 2025-02-27 17:08:37 +05:30
31 changed files with 365 additions and 532 deletions
@@ -265,8 +265,6 @@
    sections:
    - local: api/models/overview
      title: Overview
-    - local: api/models/auto_model
-      title: AutoModel
    - sections:
      - local: api/models/controlnet
        title: ControlNetModel
@@ -1,29 +0,0 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-->
-
-# AutoModel
-
-The `AutoModel` is designed to make it easy to load a checkpoint without needing to know the specific model class. `AutoModel` automatically retrieves the correct model class from the checkpoint `config.json` file.
-
-```python
-from diffusers import AutoModel, AutoPipelineForText2Image
-
-unet = AutoModel.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet")
-pipe = AutoPipelineForText2Image.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", unet=unet)
-```
-
-
-## AutoModel
-
-[[autodoc]] AutoModel
-	- all
-	- from_pretrained
@@ -53,12 +53,7 @@ def custom_convert_ldm_vae_checkpoint(checkpoint, config):
    }

    for i in range(num_down_blocks):
-        resnets = [
-            key
-            for key in down_blocks[i]
-            if f"down.{i}" in key and f"down.{i}.downsample" not in key and "attn" not in key
-        ]
-        attentions = [key for key in down_blocks[i] if f"down.{i}.attn" in key]
+        resnets = [key for key in down_blocks[i] if f"down.{i}" in key and f"down.{i}.downsample" not in key]

        if f"encoder.down.{i}.downsample.conv.weight" in vae_state_dict:
            new_checkpoint[f"encoder.down_blocks.{i}.downsamplers.0.conv.weight"] = vae_state_dict.pop(
@@ -72,10 +67,6 @@ def custom_convert_ldm_vae_checkpoint(checkpoint, config):
        meta_path = {"old": f"down.{i}.block", "new": f"down_blocks.{i}.resnets"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

-        paths = renew_vae_attention_paths(attentions)
-        meta_path = {"old": f"down.{i}.attn", "new": f"down_blocks.{i}.attentions"}
-        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-
    mid_resnets = [key for key in vae_state_dict if "encoder.mid.block" in key]
    num_mid_res_blocks = 2
    for i in range(1, num_mid_res_blocks + 1):
@@ -94,11 +85,8 @@ def custom_convert_ldm_vae_checkpoint(checkpoint, config):
    for i in range(num_up_blocks):
        block_id = num_up_blocks - 1 - i
        resnets = [
-            key
-            for key in up_blocks[block_id]
-            if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key and "attn" not in key
+            key for key in up_blocks[block_id] if f"up.{block_id}" in key and f"up.{block_id}.upsample" not in key
        ]
-        attentions = [key for key in up_blocks[block_id] if f"up.{block_id}.attn" in key]

        if f"decoder.up.{block_id}.upsample.conv.weight" in vae_state_dict:
            new_checkpoint[f"decoder.up_blocks.{i}.upsamplers.0.conv.weight"] = vae_state_dict[
@@ -112,10 +100,6 @@ def custom_convert_ldm_vae_checkpoint(checkpoint, config):
        meta_path = {"old": f"up.{block_id}.block", "new": f"up_blocks.{i}.resnets"}
        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)

-        paths = renew_vae_attention_paths(attentions)
-        meta_path = {"old": f"up.{block_id}.attn", "new": f"up_blocks.{i}.attentions"}
-        assign_to_checkpoint(paths, new_checkpoint, vae_state_dict, additional_replacements=[meta_path], config=config)
-
    mid_resnets = [key for key in vae_state_dict if "decoder.mid.block" in key]
    num_mid_res_blocks = 2
    for i in range(1, num_mid_res_blocks + 1):
@@ -142,7 +142,6 @@ _deps = [
    "urllib3<=2.0.0",
    "black",
    "phonemizer",
-    "opencv-python",
 ]

 # this is a lookup table with items like:
@@ -14,7 +14,6 @@ from .utils import (
    is_librosa_available,
    is_note_seq_available,
    is_onnx_available,
-    is_opencv_available,
    is_optimum_quanto_available,
    is_scipy_available,
    is_sentencepiece_available,
@@ -353,6 +352,7 @@ else:
            "CogView3PlusPipeline",
            "CogView4ControlPipeline",
            "CogView4Pipeline",
+            "ConsisIDPipeline",
            "CycleDiffusionPipeline",
            "EasyAnimateControlPipeline",
            "EasyAnimateInpaintPipeline",
@@ -518,19 +518,6 @@ else:
        ]
    )

-try:
-    if not (is_torch_available() and is_transformers_available() and is_opencv_available()):
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from .utils import dummy_torch_and_transformers_and_opencv_objects  # noqa F403
-
-    _import_structure["utils.dummy_torch_and_transformers_and_opencv_objects"] = [
-        name for name in dir(dummy_torch_and_transformers_and_opencv_objects) if not name.startswith("_")
-    ]
-
-else:
-    _import_structure["pipelines"].extend(["ConsisIDPipeline"])
-
 try:
    if not (is_torch_available() and is_transformers_available() and is_k_diffusion_available()):
        raise OptionalDependencyNotAvailable()
@@ -922,6 +909,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
            CogView3PlusPipeline,
            CogView4ControlPipeline,
            CogView4Pipeline,
+            ConsisIDPipeline,
            CycleDiffusionPipeline,
            EasyAnimateControlPipeline,
            EasyAnimateInpaintPipeline,
@@ -1100,15 +1088,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        from .utils.dummy_torch_and_transformers_and_sentencepiece_objects import *  # noqa F403
    else:
        from .pipelines import KolorsImg2ImgPipeline, KolorsPAGPipeline, KolorsPipeline
-
-    try:
-        if not (is_torch_available() and is_transformers_available() and is_opencv_available()):
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        from .utils.dummy_torch_and_transformers_and_opencv_objects import *  # noqa F403
-    else:
-        from .pipelines import ConsisIDPipeline
-
    try:
        if not (is_torch_available() and is_transformers_available() and is_onnx_available()):
            raise OptionalDependencyNotAvailable()
@@ -49,5 +49,4 @@ deps = {
    "urllib3": "urllib3<=2.0.0",
    "black": "black",
    "phonemizer": "phonemizer",
-    "opencv-python": "opencv-python",
 }
@@ -1608,64 +1608,3 @@ def _convert_non_diffusers_wan_lora_to_diffusers(state_dict):
        converted_state_dict[f"transformer.{key}"] = converted_state_dict.pop(key)

    return converted_state_dict
-
-
-def _convert_musubi_wan_lora_to_diffusers(state_dict):
-    # https://github.com/kohya-ss/musubi-tuner
-    converted_state_dict = {}
-    original_state_dict = {k[len("lora_unet_") :]: v for k, v in state_dict.items()}
-
-    num_blocks = len({k.split("blocks_")[1].split("_")[0] for k in original_state_dict})
-    is_i2v_lora = any("k_img" in k for k in original_state_dict) and any("v_img" in k for k in original_state_dict)
-
-    def get_alpha_scales(down_weight, key):
-        rank = down_weight.shape[0]
-        alpha = original_state_dict.pop(key + ".alpha").item()
-        scale = alpha / rank  # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
-        scale_down = scale
-        scale_up = 1.0
-        while scale_down * 2 < scale_up:
-            scale_down *= 2
-            scale_up /= 2
-        return scale_down, scale_up
-
-    for i in range(num_blocks):
-        # Self-attention
-        for o, c in zip(["q", "k", "v", "o"], ["to_q", "to_k", "to_v", "to_out.0"]):
-            down_weight = original_state_dict.pop(f"blocks_{i}_self_attn_{o}.lora_down.weight")
-            up_weight = original_state_dict.pop(f"blocks_{i}_self_attn_{o}.lora_up.weight")
-            scale_down, scale_up = get_alpha_scales(down_weight, f"blocks_{i}_self_attn_{o}")
-            converted_state_dict[f"blocks.{i}.attn1.{c}.lora_A.weight"] = down_weight * scale_down
-            converted_state_dict[f"blocks.{i}.attn1.{c}.lora_B.weight"] = up_weight * scale_up
-
-        # Cross-attention
-        for o, c in zip(["q", "k", "v", "o"], ["to_q", "to_k", "to_v", "to_out.0"]):
-            down_weight = original_state_dict.pop(f"blocks_{i}_cross_attn_{o}.lora_down.weight")
-            up_weight = original_state_dict.pop(f"blocks_{i}_cross_attn_{o}.lora_up.weight")
-            scale_down, scale_up = get_alpha_scales(down_weight, f"blocks_{i}_cross_attn_{o}")
-            converted_state_dict[f"blocks.{i}.attn2.{c}.lora_A.weight"] = down_weight * scale_down
-            converted_state_dict[f"blocks.{i}.attn2.{c}.lora_B.weight"] = up_weight * scale_up
-
-        if is_i2v_lora:
-            for o, c in zip(["k_img", "v_img"], ["add_k_proj", "add_v_proj"]):
-                down_weight = original_state_dict.pop(f"blocks_{i}_cross_attn_{o}.lora_down.weight")
-                up_weight = original_state_dict.pop(f"blocks_{i}_cross_attn_{o}.lora_up.weight")
-                scale_down, scale_up = get_alpha_scales(down_weight, f"blocks_{i}_cross_attn_{o}")
-                converted_state_dict[f"blocks.{i}.attn2.{c}.lora_A.weight"] = down_weight * scale_down
-                converted_state_dict[f"blocks.{i}.attn2.{c}.lora_B.weight"] = up_weight * scale_up
-
-        # FFN
-        for o, c in zip(["ffn_0", "ffn_2"], ["net.0.proj", "net.2"]):
-            down_weight = original_state_dict.pop(f"blocks_{i}_{o}.lora_down.weight")
-            up_weight = original_state_dict.pop(f"blocks_{i}_{o}.lora_up.weight")
-            scale_down, scale_up = get_alpha_scales(down_weight, f"blocks_{i}_{o}")
-            converted_state_dict[f"blocks.{i}.ffn.{c}.lora_A.weight"] = down_weight * scale_down
-            converted_state_dict[f"blocks.{i}.ffn.{c}.lora_B.weight"] = up_weight * scale_up
-
-    if len(original_state_dict) > 0:
-        raise ValueError(f"`state_dict` should be empty at this point but has {original_state_dict.keys()=}")
-
-    for key in list(converted_state_dict.keys()):
-        converted_state_dict[f"transformer.{key}"] = converted_state_dict.pop(key)
-
-    return converted_state_dict
@@ -42,7 +42,6 @@ from .lora_conversion_utils import (
    _convert_bfl_flux_control_lora_to_diffusers,
    _convert_hunyuan_video_lora_to_diffusers,
    _convert_kohya_flux_lora_to_diffusers,
-    _convert_musubi_wan_lora_to_diffusers,
    _convert_non_diffusers_lora_to_diffusers,
    _convert_non_diffusers_lumina2_lora_to_diffusers,
    _convert_non_diffusers_wan_lora_to_diffusers,
@@ -4795,8 +4794,6 @@ class WanLoraLoaderMixin(LoraBaseMixin):
        )
        if any(k.startswith("diffusion_model.") for k in state_dict):
            state_dict = _convert_non_diffusers_wan_lora_to_diffusers(state_dict)
-        elif any(k.startswith("lora_unet_") for k in state_dict):
-            state_dict = _convert_musubi_wan_lora_to_diffusers(state_dict)

        is_dora_scale_present = any("dora_scale" in k for k in state_dict)
        if is_dora_scale_present:
@@ -177,7 +177,6 @@ DIFFUSERS_DEFAULT_PIPELINE_PATHS = {
    "flux-schnell": {"pretrained_model_name_or_path": "black-forest-labs/FLUX.1-schnell"},
    "ltx-video": {"pretrained_model_name_or_path": "diffusers/LTX-Video-0.9.0"},
    "ltx-video-0.9.1": {"pretrained_model_name_or_path": "diffusers/LTX-Video-0.9.1"},
-    "ltx-video-0.9.5": {"pretrained_model_name_or_path": "Lightricks/LTX-Video-0.9.5"},
    "autoencoder-dc-f128c512": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f128c512-mix-1.0-diffusers"},
    "autoencoder-dc-f64c128": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f64c128-mix-1.0-diffusers"},
    "autoencoder-dc-f32c32": {"pretrained_model_name_or_path": "mit-han-lab/dc-ae-f32c32-mix-1.0-diffusers"},
@@ -639,9 +638,7 @@ def infer_diffusers_model_type(checkpoint):
            model_type = "flux-schnell"

    elif any(key in checkpoint for key in CHECKPOINT_KEY_NAMES["ltx-video"]):
-        if checkpoint["vae.encoder.conv_out.conv.weight"].shape[1] == 2048:
-            model_type = "ltx-video-0.9.5"
-        elif "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in checkpoint:
+        if "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in checkpoint:
            model_type = "ltx-video-0.9.1"
        else:
            model_type = "ltx-video"
@@ -2406,41 +2403,13 @@ def convert_ltx_vae_checkpoint_to_diffusers(checkpoint, **kwargs):
        "last_scale_shift_table": "scale_shift_table",
    }

-    VAE_095_RENAME_DICT = {
-        # decoder
-        "up_blocks.0": "mid_block",
-        "up_blocks.1": "up_blocks.0.upsamplers.0",
-        "up_blocks.2": "up_blocks.0",
-        "up_blocks.3": "up_blocks.1.upsamplers.0",
-        "up_blocks.4": "up_blocks.1",
-        "up_blocks.5": "up_blocks.2.upsamplers.0",
-        "up_blocks.6": "up_blocks.2",
-        "up_blocks.7": "up_blocks.3.upsamplers.0",
-        "up_blocks.8": "up_blocks.3",
-        # encoder
-        "down_blocks.0": "down_blocks.0",
-        "down_blocks.1": "down_blocks.0.downsamplers.0",
-        "down_blocks.2": "down_blocks.1",
-        "down_blocks.3": "down_blocks.1.downsamplers.0",
-        "down_blocks.4": "down_blocks.2",
-        "down_blocks.5": "down_blocks.2.downsamplers.0",
-        "down_blocks.6": "down_blocks.3",
-        "down_blocks.7": "down_blocks.3.downsamplers.0",
-        "down_blocks.8": "mid_block",
-        # common
-        "last_time_embedder": "time_embedder",
-        "last_scale_shift_table": "scale_shift_table",
-    }
-
    VAE_SPECIAL_KEYS_REMAP = {
        "per_channel_statistics.channel": remove_keys_,
        "per_channel_statistics.mean-of-means": remove_keys_,
        "per_channel_statistics.mean-of-stds": remove_keys_,
    }

-    if converted_state_dict["vae.encoder.conv_out.conv.weight"].shape[1] == 2048:
-        VAE_KEYS_RENAME_DICT.update(VAE_095_RENAME_DICT)
-    elif "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in converted_state_dict:
+    if "vae.decoder.last_time_embedder.timestep_embedder.linear_1.weight" in converted_state_dict:
        VAE_KEYS_RENAME_DICT.update(VAE_091_RENAME_DICT)

    for key in list(converted_state_dict.keys()):
@@ -298,6 +298,15 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
        )
        encoder_hidden_states = self.context_embedder(encoder_hidden_states)

+        if self.union:
+            # union mode
+            if controlnet_mode is None:
+                raise ValueError("`controlnet_mode` cannot be `None` when applying ControlNet-Union")
+            # union mode emb
+            controlnet_mode_emb = self.controlnet_mode_embedder(controlnet_mode)
+            encoder_hidden_states = torch.cat([controlnet_mode_emb, encoder_hidden_states], dim=1)
+            txt_ids = torch.cat([txt_ids[:1], txt_ids], dim=0)
+
        if txt_ids.ndim == 3:
            logger.warning(
                "Passing `txt_ids` 3d torch.Tensor is deprecated."
@@ -311,15 +320,6 @@ class FluxControlNetModel(ModelMixin, ConfigMixin, PeftAdapterMixin):
            )
            img_ids = img_ids[0]

-        if self.union:
-            # union mode
-            if controlnet_mode is None:
-                raise ValueError("`controlnet_mode` cannot be `None` when applying ControlNet-Union")
-            # union mode emb
-            controlnet_mode_emb = self.controlnet_mode_embedder(controlnet_mode)
-            encoder_hidden_states = torch.cat([controlnet_mode_emb, encoder_hidden_states], dim=1)
-            txt_ids = torch.cat([txt_ids[:1], txt_ids], dim=0)
-
        ids = torch.cat((txt_ids, img_ids), dim=0)
        image_rotary_emb = self.pos_embed(ids)

@@ -10,7 +10,6 @@ from ..utils import (
    is_librosa_available,
    is_note_seq_available,
    is_onnx_available,
-    is_opencv_available,
    is_sentencepiece_available,
    is_torch_available,
    is_torch_npu_available,
@@ -156,6 +155,7 @@ else:
    ]
    _import_structure["cogview3"] = ["CogView3PlusPipeline"]
    _import_structure["cogview4"] = ["CogView4Pipeline", "CogView4ControlPipeline"]
+    _import_structure["consisid"] = ["ConsisIDPipeline"]
    _import_structure["controlnet"].extend(
        [
            "BlipDiffusionControlNetPipeline",
@@ -414,18 +414,6 @@ else:
        "KolorsImg2ImgPipeline",
    ]

-try:
-    if not (is_torch_available() and is_transformers_available() and is_opencv_available()):
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    from ..utils import (
-        dummy_torch_and_transformers_and_opencv_objects,
-    )
-
-    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_opencv_objects))
-else:
-    _import_structure["consisid"] = ["ConsisIDPipeline"]
-
 try:
    if not is_flax_available():
        raise OptionalDependencyNotAvailable()
@@ -524,6 +512,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
        )
        from .cogview3 import CogView3PlusPipeline
        from .cogview4 import CogView4ControlPipeline, CogView4Pipeline
+        from .consisid import ConsisIDPipeline
        from .controlnet import (
            BlipDiffusionControlNetPipeline,
            StableDiffusionControlNetImg2ImgPipeline,
@@ -772,14 +761,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
                KolorsPipeline,
            )

-        try:
-            if not (is_torch_available() and is_transformers_available() and is_opencv_available()):
-                raise OptionalDependencyNotAvailable()
-        except OptionalDependencyNotAvailable:
-            from ..utils.dummy_torch_and_transformers_and_opencv_objects import *
-        else:
-            from .consisid import ConsisIDPipeline
-
        try:
            if not is_flax_available():
                raise OptionalDependencyNotAvailable()
@@ -5,7 +5,6 @@ from ...utils import (
    OptionalDependencyNotAvailable,
    _LazyModule,
    get_objects_from_module,
-    is_opencv_available,
    is_torch_available,
    is_transformers_available,
 )
@@ -16,12 +15,12 @@ _import_structure = {}


 try:
-    if not (is_transformers_available() and is_torch_available() and is_opencv_available()):
+    if not (is_transformers_available() and is_torch_available()):
        raise OptionalDependencyNotAvailable()
 except OptionalDependencyNotAvailable:
-    from ...utils import dummy_torch_and_transformers_and_opencv_objects  # noqa F403
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403

-    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_and_opencv_objects))
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
 else:
    _import_structure["pipeline_consisid"] = ["ConsisIDPipeline"]

@@ -16,6 +16,7 @@ import inspect
 import math
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

+import cv2
 import numpy as np
 import PIL
 import torch
@@ -28,16 +29,12 @@ from ...models import AutoencoderKLCogVideoX, ConsisIDTransformer3DModel
 from ...models.embeddings import get_3d_rotary_pos_embed
 from ...pipelines.pipeline_utils import DiffusionPipeline
 from ...schedulers import CogVideoXDPMScheduler
-from ...utils import is_opencv_available, logging, replace_example_docstring
+from ...utils import logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from .pipeline_output import ConsisIDPipelineOutput


-if is_opencv_available():
-    import cv2
-
-
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name


@@ -350,14 +350,8 @@ def create_vae_diffusers_config(original_config, image_size: int):
    _ = original_config["model"]["params"]["first_stage_config"]["params"]["embed_dim"]

    block_out_channels = [vae_params["ch"] * mult for mult in vae_params["ch_mult"]]
-    down_block_types = [
-        "DownEncoderBlock2D" if image_size // 2**i not in vae_params["attn_resolutions"] else "AttnDownEncoderBlock2D"
-        for i, _ in enumerate(block_out_channels)
-    ]
-    up_block_types = [
-        "UpDecoderBlock2D" if image_size // 2**i not in vae_params["attn_resolutions"] else "AttnUpDecoderBlock2D"
-        for i, _ in enumerate(block_out_channels)
-    ][::-1]
+    down_block_types = ["DownEncoderBlock2D"] * len(block_out_channels)
+    up_block_types = ["UpDecoderBlock2D"] * len(block_out_channels)

    config = {
        "sample_size": image_size,
@@ -15,6 +15,7 @@
 import html
 from typing import Any, Callable, Dict, List, Optional, Union

+import ftfy
 import regex as re
 import torch
 from transformers import AutoTokenizer, UMT5EncoderModel
@@ -23,7 +24,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...loaders import WanLoraLoaderMixin
 from ...models import AutoencoderKLWan, WanTransformer3DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
+from ...utils import is_torch_xla_available, logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline
@@ -39,9 +40,6 @@ else:

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

-if is_ftfy_available():
-    import ftfy
-

 EXAMPLE_DOC_STRING = """
    Examples:
@@ -15,6 +15,7 @@
 import html
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

+import ftfy
 import PIL
 import regex as re
 import torch
@@ -25,7 +26,7 @@ from ...image_processor import PipelineImageInput
 from ...loaders import WanLoraLoaderMixin
 from ...models import AutoencoderKLWan, WanTransformer3DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
+from ...utils import is_torch_xla_available, logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline
@@ -41,9 +42,6 @@ else:

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

-if is_ftfy_available():
-    import ftfy
-
 EXAMPLE_DOC_STRING = """
    Examples:
        ```python
@@ -16,6 +16,7 @@ import html
 import inspect
 from typing import Any, Callable, Dict, List, Optional, Union

+import ftfy
 import regex as re
 import torch
 from PIL import Image
@@ -25,7 +26,7 @@ from ...callbacks import MultiPipelineCallbacks, PipelineCallback
 from ...loaders import WanLoraLoaderMixin
 from ...models import AutoencoderKLWan, WanTransformer3DModel
 from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import is_ftfy_available, is_torch_xla_available, logging, replace_example_docstring
+from ...utils import is_torch_xla_available, logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ...video_processor import VideoProcessor
 from ..pipeline_utils import DiffusionPipeline
@@ -41,9 +42,6 @@ else:

 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

-if is_ftfy_available():
-    import ftfy
-

 EXAMPLE_DOC_STRING = """
    Examples:
@@ -79,7 +79,6 @@ from .import_utils import (
    is_matplotlib_available,
    is_note_seq_available,
    is_onnx_available,
-    is_opencv_available,
    is_optimum_quanto_available,
    is_optimum_quanto_version,
    is_peft_available,
@@ -1,17 +0,0 @@
-# This file is autogenerated by the command `make fix-copies`, do not edit.
-from ..utils import DummyObject, requires_backends
-
-
-class ConsisIDPipeline(metaclass=DummyObject):
-    _backends = ["torch", "transformers", "opencv"]
-
-    def __init__(self, *args, **kwargs):
-        requires_backends(self, ["torch", "transformers", "opencv"])
-
-    @classmethod
-    def from_config(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers", "opencv"])
-
-    @classmethod
-    def from_pretrained(cls, *args, **kwargs):
-        requires_backends(cls, ["torch", "transformers", "opencv"])
@@ -392,6 +392,21 @@ class CogView4Pipeline(metaclass=DummyObject):
        requires_backends(cls, ["torch", "transformers"])


+class ConsisIDPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+
+
 class CycleDiffusionPipeline(metaclass=DummyObject):
    _backends = ["torch", "transformers"]

@@ -101,20 +101,18 @@ _onnx_available = importlib.util.find_spec("onnxruntime") is not None
 if _onnx_available:
    candidates = (
        "onnxruntime",
-        "onnxruntime-cann",
-        "onnxruntime-directml",
-        "ort_nightly_directml",
        "onnxruntime-gpu",
        "ort_nightly_gpu",
-        "onnxruntime-migraphx",
+        "onnxruntime-directml",
        "onnxruntime-openvino",
-        "onnxruntime-qnn",
+        "ort_nightly_directml",
        "onnxruntime-rocm",
+        "onnxruntime-migraphx",
        "onnxruntime-training",
        "onnxruntime-vitisai",
    )
    _onnxruntime_version = None
-    # For the metadata, we have to look for both onnxruntime and onnxruntime-x
+    # For the metadata, we have to look for both onnxruntime and onnxruntime-gpu
    for pkg in candidates:
        try:
            _onnxruntime_version = importlib_metadata.version(pkg)
@@ -33,7 +33,6 @@ from diffusers import (
 )
 from diffusers.utils.import_utils import is_accelerate_available
 from diffusers.utils.testing_utils import (
-    Expectations,
    backend_empty_cache,
    load_image,
    nightly,
@@ -456,54 +455,11 @@ class LoraIntegrationTests(unittest.TestCase):

        images = pipe("A pokemon with blue eyes.", output_type="np", generator=generator, num_inference_steps=2).images

-        image_slice = images[0, -3:, -3:, -1].flatten()
+        images = images[0, -3:, -3:, -1].flatten()

-        expected_slices = Expectations(
-            {
-                ("xpu", 3): np.array(
-                    [
-                        0.6544,
-                        0.6127,
-                        0.5397,
-                        0.6845,
-                        0.6047,
-                        0.5469,
-                        0.6349,
-                        0.5906,
-                        0.5382,
-                    ]
-                ),
-                ("cuda", 7): np.array(
-                    [
-                        0.7406,
-                        0.699,
-                        0.5963,
-                        0.7493,
-                        0.7045,
-                        0.6096,
-                        0.6886,
-                        0.6388,
-                        0.583,
-                    ]
-                ),
-                ("cuda", 8): np.array(
-                    [
-                        0.6542,
-                        0.61253,
-                        0.5396,
-                        0.6843,
-                        0.6044,
-                        0.5468,
-                        0.6349,
-                        0.5905,
-                        0.5381,
-                    ]
-                ),
-            }
-        )
-        expected_slice = expected_slices.get_expectation()
+        expected = np.array([0.7406, 0.699, 0.5963, 0.7493, 0.7045, 0.6096, 0.6886, 0.6388, 0.583])

-        max_diff = numpy_cosine_similarity_distance(expected_slice, image_slice)
+        max_diff = numpy_cosine_similarity_distance(expected, images)
        assert max_diff < 1e-4

        pipe.unload_lora_weights()
@@ -260,31 +260,6 @@ class PeftLoraLoaderMixinTests:

        return modules_to_save

-    def check_if_adapters_added_correctly(
-        self, pipe, text_lora_config=None, denoiser_lora_config=None, adapter_name="default"
-    ):
-        if text_lora_config is not None:
-            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
-                pipe.text_encoder.add_adapter(text_lora_config, adapter_name=adapter_name)
-                self.assertTrue(
-                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
-                )
-
-        if denoiser_lora_config is not None:
-            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
-            denoiser.add_adapter(denoiser_lora_config, adapter_name=adapter_name)
-            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
-        else:
-            denoiser = None
-
-        if text_lora_config is not None and self.has_two_text_encoders or self.has_three_text_encoders:
-            if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
-                pipe.text_encoder_2.add_adapter(text_lora_config, adapter_name=adapter_name)
-                self.assertTrue(
-                    check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
-                )
-        return pipe, denoiser
-
    def test_simple_inference(self):
        """
        Tests a simple inference and makes sure it works as expected
@@ -314,7 +289,16 @@ class PeftLoraLoaderMixinTests:
            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(output_no_lora.shape == self.output_shape)

-            pipe, _ = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config=None)
+            pipe.text_encoder.add_adapter(text_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                lora_loadable_components = self.pipeline_class._lora_loadable_modules
+                if "text_encoder_2" in lora_loadable_components:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )

            output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(
@@ -397,7 +381,22 @@ class PeftLoraLoaderMixinTests:
            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(output_no_lora.shape == self.output_shape)

-            pipe, _ = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config)
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )

            images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]

@@ -460,7 +459,16 @@ class PeftLoraLoaderMixinTests:
            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(output_no_lora.shape == self.output_shape)

-            pipe, _ = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config=None)
+            pipe.text_encoder.add_adapter(text_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                lora_loadable_components = self.pipeline_class._lora_loadable_modules
+                if "text_encoder_2" in lora_loadable_components:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )

            output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(
@@ -498,7 +506,15 @@ class PeftLoraLoaderMixinTests:
            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(output_no_lora.shape == self.output_shape)

-            pipe, _ = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config=None)
+            pipe.text_encoder.add_adapter(text_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )

            pipe.fuse_lora()
            # Fusing should still keep the LoRA layers
@@ -530,7 +546,19 @@ class PeftLoraLoaderMixinTests:
            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(output_no_lora.shape == self.output_shape)

-            pipe, _ = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config=None)
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                lora_loadable_components = self.pipeline_class._lora_loadable_modules
+                if "text_encoder_2" in lora_loadable_components:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )

            pipe.unload_lora_weights()
            # unloading should remove the LoRA layers
@@ -565,7 +593,18 @@ class PeftLoraLoaderMixinTests:
            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(output_no_lora.shape == self.output_shape)

-            pipe, _ = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config=None)
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )

            images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]

@@ -616,20 +655,22 @@ class PeftLoraLoaderMixinTests:
            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(output_no_lora.shape == self.output_shape)

-            pipe, _ = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config=None)
-
-            state_dict = {}
-            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
-                # Gather the state dict for the PEFT model, excluding `layers.4`, to ensure `load_lora_into_text_encoder`
-                # supports missing layers (PR#8324).
-                state_dict = {
-                    f"text_encoder.{module_name}": param
-                    for module_name, param in get_peft_model_state_dict(pipe.text_encoder).items()
-                    if "text_model.encoder.layers.4" not in module_name
-                }
+            pipe.text_encoder.add_adapter(text_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+            # Gather the state dict for the PEFT model, excluding `layers.4`, to ensure `load_lora_into_text_encoder`
+            # supports missing layers (PR#8324).
+            state_dict = {
+                f"text_encoder.{module_name}": param
+                for module_name, param in get_peft_model_state_dict(pipe.text_encoder).items()
+                if "text_model.encoder.layers.4" not in module_name
+            }

            if self.has_two_text_encoders or self.has_three_text_encoders:
                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
                    state_dict.update(
                        {
                            f"text_encoder_2.{module_name}": param
@@ -653,7 +694,7 @@ class PeftLoraLoaderMixinTests:
                "Removing adapters should change the output",
            )

-    def test_simple_inference_save_pretrained_with_text_lora(self):
+    def test_simple_inference_save_pretrained(self):
        """
        Tests a simple usecase where users could use saving utilities for LoRA through save_pretrained
        """
@@ -667,7 +708,16 @@ class PeftLoraLoaderMixinTests:
            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(output_no_lora.shape == self.output_shape)

-            pipe, _ = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config=None)
+            pipe.text_encoder.add_adapter(text_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )
+
            images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]

            with tempfile.TemporaryDirectory() as tmpdirname:
@@ -676,11 +726,10 @@ class PeftLoraLoaderMixinTests:
                pipe_from_pretrained = self.pipeline_class.from_pretrained(tmpdirname)
                pipe_from_pretrained.to(torch_device)

-            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
-                self.assertTrue(
-                    check_if_lora_correctly_set(pipe_from_pretrained.text_encoder),
-                    "Lora not correctly set in text encoder",
-                )
+            self.assertTrue(
+                check_if_lora_correctly_set(pipe_from_pretrained.text_encoder),
+                "Lora not correctly set in text encoder",
+            )

            if self.has_two_text_encoders or self.has_three_text_encoders:
                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
@@ -710,7 +759,22 @@ class PeftLoraLoaderMixinTests:
            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(output_no_lora.shape == self.output_shape)

-            pipe, _ = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config)
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )

            images_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]

@@ -756,7 +820,22 @@ class PeftLoraLoaderMixinTests:
            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(output_no_lora.shape == self.output_shape)

-            pipe, _ = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config)
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )

            output_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(
@@ -800,7 +879,22 @@ class PeftLoraLoaderMixinTests:
            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(output_no_lora.shape == self.output_shape)

-            pipe, denoiser = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config)
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )

            pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules)

@@ -838,7 +932,22 @@ class PeftLoraLoaderMixinTests:
            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(output_no_lora.shape == self.output_shape)

-            pipe, denoiser = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config)
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )

            pipe.unload_lora_weights()
            # unloading should remove the LoRA layers
@@ -874,7 +983,22 @@ class PeftLoraLoaderMixinTests:
            pipe.set_progress_bar_config(disable=None)
            _, _, inputs = self.get_dummy_inputs(with_generator=False)

-            pipe, denoiser = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config)
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )

            pipe.fuse_lora(components=self.pipeline_class._lora_loadable_modules)
            output_fused_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
@@ -980,8 +1104,6 @@ class PeftLoraLoaderMixinTests:
            )

    def test_wrong_adapter_name_raises_error(self):
-        adapter_name = "adapter-1"
-
        scheduler_cls = self.scheduler_classes[0]
        components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
        pipe = self.pipeline_class(**components)
@@ -989,9 +1111,20 @@ class PeftLoraLoaderMixinTests:
        pipe.set_progress_bar_config(disable=None)
        _, _, inputs = self.get_dummy_inputs(with_generator=False)

-        pipe, _ = self.check_if_adapters_added_correctly(
-            pipe, text_lora_config, denoiser_lora_config, adapter_name=adapter_name
-        )
+        if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+            pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+        denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+        denoiser.add_adapter(denoiser_lora_config, "adapter-1")
+        self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+        if self.has_two_text_encoders or self.has_three_text_encoders:
+            if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1")
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                )

        with self.assertRaises(ValueError) as err_context:
            pipe.set_adapters("test")
@@ -999,11 +1132,10 @@ class PeftLoraLoaderMixinTests:
        self.assertTrue("not in the list of present adapters" in str(err_context.exception))

        # test this works.
-        pipe.set_adapters(adapter_name)
+        pipe.set_adapters("adapter-1")
        _ = pipe(**inputs, generator=torch.manual_seed(0))[0]

    def test_multiple_wrong_adapter_name_raises_error(self):
-        adapter_name = "adapter-1"
        scheduler_cls = self.scheduler_classes[0]
        components, text_lora_config, denoiser_lora_config = self.get_dummy_components(scheduler_cls)
        pipe = self.pipeline_class(**components)
@@ -1011,22 +1143,33 @@ class PeftLoraLoaderMixinTests:
        pipe.set_progress_bar_config(disable=None)
        _, _, inputs = self.get_dummy_inputs(with_generator=False)

-        pipe, _ = self.check_if_adapters_added_correctly(
-            pipe, text_lora_config, denoiser_lora_config, adapter_name=adapter_name
-        )
+        if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+            pipe.text_encoder.add_adapter(text_lora_config, "adapter-1")
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+        denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+        denoiser.add_adapter(denoiser_lora_config, "adapter-1")
+        self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+        if self.has_two_text_encoders or self.has_three_text_encoders:
+            if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder_2.add_adapter(text_lora_config, "adapter-1")
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                )

        scale_with_wrong_components = {"foo": 0.0, "bar": 0.0, "tik": 0.0}
        logger = logging.get_logger("diffusers.loaders.lora_base")
        logger.setLevel(30)
        with CaptureLogger(logger) as cap_logger:
-            pipe.set_adapters(adapter_name, adapter_weights=scale_with_wrong_components)
+            pipe.set_adapters("adapter-1", adapter_weights=scale_with_wrong_components)

        wrong_components = sorted(set(scale_with_wrong_components.keys()))
        msg = f"The following components in `adapter_weights` are not part of the pipeline: {wrong_components}. "
        self.assertTrue(msg in str(cap_logger.out))

        # test this works.
-        pipe.set_adapters(adapter_name)
+        pipe.set_adapters("adapter-1")
        _ = pipe(**inputs, generator=torch.manual_seed(0))[0]

    def test_simple_inference_with_text_denoiser_block_scale(self):
@@ -1661,7 +1804,20 @@ class PeftLoraLoaderMixinTests:
            output_no_dora_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(output_no_dora_lora.shape == self.output_shape)

-            pipe, _ = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config)
+            pipe.text_encoder.add_adapter(text_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                lora_loadable_components = self.pipeline_class._lora_loadable_modules
+                if "text_encoder_2" in lora_loadable_components:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )

            output_dora_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]

@@ -1752,7 +1908,18 @@ class PeftLoraLoaderMixinTests:
            pipe.set_progress_bar_config(disable=None)
            _, _, inputs = self.get_dummy_inputs(with_generator=False)

-            pipe, _ = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config)
+            pipe.text_encoder.add_adapter(text_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder")
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                pipe.text_encoder_2.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                )

            pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
            pipe.text_encoder = torch.compile(pipe.text_encoder, mode="reduce-overhead", fullgraph=True)
@@ -1844,7 +2011,22 @@ class PeftLoraLoaderMixinTests:
            output_no_lora = pipe(**inputs, generator=torch.manual_seed(0))[0]
            self.assertTrue(output_no_lora.shape == self.output_shape)

-            pipe, _ = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config)
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )

            lora_scale = 0.5
            attention_kwargs = {attention_kwargs_name: {"scale": lora_scale}}
@@ -2029,7 +2211,22 @@ class PeftLoraLoaderMixinTests:
            pipe = pipe.to(torch_device, dtype=compute_dtype)
            pipe.set_progress_bar_config(disable=None)

-            pipe, denoiser = self.check_if_adapters_added_correctly(pipe, text_lora_config, denoiser_lora_config)
+            if "text_encoder" in self.pipeline_class._lora_loadable_modules:
+                pipe.text_encoder.add_adapter(text_lora_config)
+                self.assertTrue(
+                    check_if_lora_correctly_set(pipe.text_encoder), "Lora not correctly set in text encoder"
+                )
+
+            denoiser = pipe.transformer if self.unet_kwargs is None else pipe.unet
+            denoiser.add_adapter(denoiser_lora_config)
+            self.assertTrue(check_if_lora_correctly_set(denoiser), "Lora not correctly set in denoiser.")
+
+            if self.has_two_text_encoders or self.has_three_text_encoders:
+                if "text_encoder_2" in self.pipeline_class._lora_loadable_modules:
+                    pipe.text_encoder_2.add_adapter(text_lora_config)
+                    self.assertTrue(
+                        check_if_lora_correctly_set(pipe.text_encoder_2), "Lora not correctly set in text encoder 2"
+                    )

            if storage_dtype is not None:
                denoiser.enable_layerwise_casting(storage_dtype=storage_dtype, compute_dtype=compute_dtype)
@@ -37,8 +37,6 @@ class DependencyTester(unittest.TestCase):
                        backend = "k-diffusion"
                    elif backend == "invisible_watermark":
                        backend = "invisible-watermark"
-                    elif backend == "opencv":
-                        backend = "opencv-python"
                    assert backend in deps, f"{backend} is not in the deps table!"

    def test_pipeline_imports(self):
@@ -8,7 +8,6 @@ import torch
 from diffusers import FluxPipeline, FluxPriorReduxPipeline
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
-    Expectations,
    backend_empty_cache,
    numpy_cosine_similarity_distance,
    require_big_accelerator,
@@ -22,7 +21,7 @@ from diffusers.utils.testing_utils import (
@pytest.mark.big_gpu_with_torch_cuda
 class FluxReduxSlowTests(unittest.TestCase):
    pipeline_class = FluxPriorReduxPipeline
-    repo_id = "black-forest-labs/FLUX.1-Redux-dev"
+    repo_id = "YiYiXu/yiyi-redux"  # update to "black-forest-labs/FLUX.1-Redux-dev" once PR is merged
    base_pipeline_class = FluxPipeline
    base_repo_id = "black-forest-labs/FLUX.1-schnell"

@@ -70,82 +69,41 @@ class FluxReduxSlowTests(unittest.TestCase):
        image = pipe_base(**base_pipeline_inputs, **redux_pipeline_output).images[0]

        image_slice = image[0, :10, :10]
-        expected_slices = Expectations(
-            {
-                ("cuda", 7): np.array(
-                    [
-                        0.30078125,
-                        0.37890625,
-                        0.46875,
-                        0.28125,
-                        0.36914062,
-                        0.47851562,
-                        0.28515625,
-                        0.375,
-                        0.4765625,
-                        0.28125,
-                        0.375,
-                        0.48046875,
-                        0.27929688,
-                        0.37695312,
-                        0.47851562,
-                        0.27734375,
-                        0.38085938,
-                        0.4765625,
-                        0.2734375,
-                        0.38085938,
-                        0.47265625,
-                        0.27539062,
-                        0.37890625,
-                        0.47265625,
-                        0.27734375,
-                        0.37695312,
-                        0.47070312,
-                        0.27929688,
-                        0.37890625,
-                        0.47460938,
-                    ],
-                    dtype=np.float32,
-                ),
-                ("xpu", 3): np.array(
-                    [
-                        0.20507812,
-                        0.30859375,
-                        0.3984375,
-                        0.18554688,
-                        0.30078125,
-                        0.41015625,
-                        0.19921875,
-                        0.3125,
-                        0.40625,
-                        0.19726562,
-                        0.3125,
-                        0.41601562,
-                        0.19335938,
-                        0.31445312,
-                        0.4140625,
-                        0.1953125,
-                        0.3203125,
-                        0.41796875,
-                        0.19726562,
-                        0.32421875,
-                        0.41992188,
-                        0.19726562,
-                        0.32421875,
-                        0.41992188,
-                        0.20117188,
-                        0.32421875,
-                        0.41796875,
-                        0.203125,
-                        0.32617188,
-                        0.41796875,
-                    ],
-                    dtype=np.float32,
-                ),
-            }
+        expected_slice = np.array(
+            [
+                0.30078125,
+                0.37890625,
+                0.46875,
+                0.28125,
+                0.36914062,
+                0.47851562,
+                0.28515625,
+                0.375,
+                0.4765625,
+                0.28125,
+                0.375,
+                0.48046875,
+                0.27929688,
+                0.37695312,
+                0.47851562,
+                0.27734375,
+                0.38085938,
+                0.4765625,
+                0.2734375,
+                0.38085938,
+                0.47265625,
+                0.27539062,
+                0.37890625,
+                0.47265625,
+                0.27734375,
+                0.37695312,
+                0.47070312,
+                0.27929688,
+                0.37890625,
+                0.47460938,
+            ],
+            dtype=np.float32,
        )
-        expected_slice = expected_slices.get_expectation()
-
        max_diff = numpy_cosine_similarity_distance(expected_slice.flatten(), image_slice.flatten())

        assert max_diff < 1e-4
@@ -187,7 +187,7 @@ class I2VGenXLPipelineFastTests(SDFunctionTesterMixin, PipelineTesterMixin, unit
        super().test_sequential_cpu_offload_forward_pass(expected_max_diff=0.008)

    def test_dict_tuple_outputs_equivalent(self):
-        super().test_dict_tuple_outputs_equivalent(expected_max_difference=0.009)
+        super().test_dict_tuple_outputs_equivalent(expected_max_difference=0.008)

    def test_save_load_optional_components(self):
        super().test_save_load_optional_components(expected_max_difference=0.008)
@@ -34,7 +34,6 @@ from diffusers import (
 from diffusers.image_processor import IPAdapterMaskProcessor
 from diffusers.utils import load_image
 from diffusers.utils.testing_utils import (
-    Expectations,
    backend_empty_cache,
    enable_full_determinism,
    is_flaky,
@@ -665,50 +664,7 @@ class IPAdapterSDXLIntegrationTests(IPAdapterNightlyTestsMixin):
        images = pipeline(**inputs).images
        image_slice = images[0, :3, :3, -1].flatten()

-        expected_slices = Expectations(
-            {
-                ("xpu", 3): np.array(
-                    [
-                        0.2520,
-                        0.1050,
-                        0.1510,
-                        0.0997,
-                        0.0893,
-                        0.0019,
-                        0.0000,
-                        0.0000,
-                        0.0210,
-                    ]
-                ),
-                ("cuda", 7): np.array(
-                    [
-                        0.2323,
-                        0.1026,
-                        0.1338,
-                        0.0638,
-                        0.0662,
-                        0.0000,
-                        0.0000,
-                        0.0000,
-                        0.0199,
-                    ]
-                ),
-                ("cuda", 8): np.array(
-                    [
-                        0.2518,
-                        0.1059,
-                        0.1553,
-                        0.0977,
-                        0.0852,
-                        0.0000,
-                        0.0000,
-                        0.0000,
-                        0.0220,
-                    ]
-                ),
-            }
-        )
-        expected_slice = expected_slices.get_expectation()
+        expected_slice = np.array([0.2323, 0.1026, 0.1338, 0.0638, 0.0662, 0.0000, 0.0000, 0.0000, 0.0199])

        max_diff = numpy_cosine_similarity_distance(image_slice, expected_slice)
        assert max_diff < 5e-4
@@ -37,7 +37,6 @@ from diffusers import (
    UNet2DConditionModel,
 )
 from diffusers.utils.testing_utils import (
-    Expectations,
    backend_empty_cache,
    backend_max_memory_allocated,
    backend_reset_max_memory_allocated,
@@ -867,37 +866,7 @@ class StableDiffusionInpaintPipelineAsymmetricAutoencoderKLSlowTests(unittest.Te
        image_slice = image[0, 253:256, 253:256, -1].flatten()

        assert image.shape == (1, 512, 512, 3)
-        expected_slices = Expectations(
-            {
-                ("xpu", 3): np.array(
-                    [
-                        0.2063,
-                        0.1731,
-                        0.1553,
-                        0.1741,
-                        0.1772,
-                        0.1077,
-                        0.2109,
-                        0.2407,
-                        0.1243,
-                    ]
-                ),
-                ("cuda", 7): np.array(
-                    [
-                        0.1343,
-                        0.1406,
-                        0.1440,
-                        0.1504,
-                        0.1729,
-                        0.0989,
-                        0.1807,
-                        0.2822,
-                        0.1179,
-                    ]
-                ),
-            }
-        )
-        expected_slice = expected_slices.get_expectation()
+        expected_slice = np.array([0.1343, 0.1406, 0.1440, 0.1504, 0.1729, 0.0989, 0.1807, 0.2822, 0.1179])

        assert np.abs(expected_slice - image_slice).max() < 5e-2

@@ -381,7 +381,7 @@ class UnCLIPPipelineFastTests(PipelineTesterMixin, unittest.TestCase):
        ]

        self._test_inference_batch_single_identical(
-            additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, expected_max_diff=9.8e-3
+            additional_params_copy_to_batched_inputs=additional_params_copy_to_batched_inputs, expected_max_diff=5e-3
        )

    def test_inference_batch_consistent(self):
@@ -17,6 +17,8 @@
 import requests
 from packaging.version import parse

+from ..src.diffusers.utils.constants import DIFFUSERS_REQUEST_TIMEOUT
+

 # GitHub repository details
 USER = "huggingface"
@@ -31,7 +33,7 @@ def fetch_all_branches(user, repo):
        response = requests.get(
            f"https://api.github.com/repos/{user}/{repo}/branches",
            params={"page": page},
-            timeout=60,
+            timeout=DIFFUSERS_REQUEST_TIMEOUT,
        )

        # Check if the request was successful
@@ -17,6 +17,8 @@ import os

 import requests

+from ..src.diffusers.utils.constants import DIFFUSERS_REQUEST_TIMEOUT
+

 # Configuration
 LIBRARY_NAME = "diffusers"
@@ -26,7 +28,7 @@ SLACK_WEBHOOK_URL = os.getenv("SLACK_WEBHOOK_URL")

 def check_pypi_for_latest_release(library_name):
    """Check PyPI for the latest release of the library."""
-    response = requests.get(f"https://pypi.org/pypi/{library_name}/json", timeout=60)
+    response = requests.get(f"https://pypi.org/pypi/{library_name}/json", timeout=DIFFUSERS_REQUEST_TIMEOUT)
    if response.status_code == 200:
        data = response.json()
        return data["info"]["version"]
@@ -38,7 +40,7 @@ def check_pypi_for_latest_release(library_name):
 def get_github_release_info(github_repo):
    """Fetch the latest release info from GitHub."""
    url = f"https://api.github.com/repos/{github_repo}/releases/latest"
-    response = requests.get(url, timeout=60)
+    response = requests.get(url, timeout=DIFFUSERS_REQUEST_TIMEOUT)

    if response.status_code == 200:
        data = response.json()
Author	SHA1	Message	Date
DN6	b365801c57	update	2025-04-09 15:34:42 +05:30
DN6	644147a198	Merge branch 'main' into ruff-update	2025-04-09 15:22:55 +05:30
DN6	c852f239f2	update	2025-03-08 08:17:14 +05:30
DN6	be861e236f	update	2025-03-08 08:07:10 +05:30
DN6	2d744f0707	Merge branch 'main' into ruff-update	2025-03-08 08:05:08 +05:30
DN6	41c7e72d44	update	2025-02-27 17:08:37 +05:30