Merge branch 'main' into custom-block-local-code

update
2025-10-30 12:11:17 -10:00 · 2025-09-24 14:33:38 +05:30
27 changed files with 78 additions and 457 deletions
@@ -529,6 +529,8 @@
        title: Kandinsky 2.2
      - local: api/pipelines/kandinsky3
        title: Kandinsky 3
+      - local: api/pipelines/kandinsky5
+        title: Kandinsky 5
      - local: api/pipelines/kolors
        title: Kolors
      - local: api/pipelines/latent_consistency_models
@@ -636,8 +638,6 @@
        title: HunyuanVideo
      - local: api/pipelines/i2vgenxl
        title: I2VGen-XL
-      - local: api/pipelines/kandinsky5_video
-        title: Kandinsky 5.0 Video
      - local: api/pipelines/latte
        title: Latte
      - local: api/pipelines/ltx_video
@@ -7,9 +7,9 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
 specific language governing permissions and limitations under the License.
 -->

-# Kandinsky 5.0 Video
+# Kandinsky 5.0

-Kandinsky 5.0 Video is created by the Kandinsky team: Alexey Letunovskiy, Maria Kovaleva, Ivan Kirillov, Lev Novitskiy, Denis Koposov, Dmitrii Mikhailov, Anna Averchenkova, Andrey Shutkin, Julia Agafonova, Olga Kim, Anastasiia Kargapoltseva, Nikita Kiselev, Anna Dmitrienko,  Anastasia Maltseva, Kirill Chernyshev, Ilia Vasiliev, Viacheslav Vasilev, Vladimir Polovnikov, Yury Kolabushin, Alexander Belykh, Mikhail Mamaev, Anastasia Aliaskina, Tatiana Nikulina, Polina Gavrilova, Vladimir Arkhipkin, Vladimir Korviakov, Nikolai Gerasimenko, Denis Parkhomenko, Denis Dimitrov
+Kandinsky 5.0 is created by the Kandinsky team: Alexey Letunovskiy, Maria Kovaleva, Ivan Kirillov, Lev Novitskiy, Denis Koposov, Dmitrii Mikhailov, Anna Averchenkova, Andrey Shutkin, Julia Agafonova, Olga Kim, Anastasiia Kargapoltseva, Nikita Kiselev, Anna Dmitrienko,  Anastasia Maltseva, Kirill Chernyshev, Ilia Vasiliev, Viacheslav Vasilev, Vladimir Polovnikov, Yury Kolabushin, Alexander Belykh, Mikhail Mamaev, Anastasia Aliaskina, Tatiana Nikulina, Polina Gavrilova, Vladimir Arkhipkin, Vladimir Korviakov, Nikolai Gerasimenko, Denis Parkhomenko, Denis Dimitrov


 Kandinsky 5.0 is a family of diffusion models for Video & Image generation. Kandinsky 5.0 T2V Lite is a lightweight video generation model (2B parameters) that ranks #1 among open-source models in its class. It outperforms larger models and offers the best understanding of Russian concepts in the open-source ecosystem.
@@ -92,7 +92,7 @@ pipe = pipe.to("cuda")

 pipe.transformer.set_attention_backend(
    "flex"
-)                                       # <--- Sett attention bakend to Flex
+)                                       # <--- Set attention backend to Flex
 pipe.transformer.compile(
    mode="max-autotune-no-cudagraphs", 
    dynamic=True
@@ -115,7 +115,7 @@ export_to_video(output, "output.mp4", fps=24, quality=9)
 ```

 ### Diffusion Distilled model
-**⚠️ Warning!** all nocfg and diffusion distilled models should be infered wothout CFG (```guidance_scale=1.0```):
+**⚠️ Warning!** all nocfg and diffusion distilled models should be inferred without CFG (```guidance_scale=1.0```):

 ```python
 model_id = "ai-forever/Kandinsky-5.0-T2V-Lite-distilled16steps-5s-Diffusers"
@@ -104,8 +104,6 @@ To use your own dataset, there are 2 ways:
 - you can either provide your own folder as `--train_data_dir`
 - or you can upload your dataset to the hub (possibly as a private repo, if you prefer so), and simply pass the `--dataset_name` argument.

-If your dataset contains 16 or 32-bit channels (for example, medical TIFFs), add the `--preserve_input_precision` flag so the preprocessing keeps the original precision while still training a 3-channel model. Precision still depends on the decoder: Pillow keeps 16-bit grayscale and float inputs, but many 16-bit RGB files are decoded as 8-bit RGB, and the flag cannot recover precision lost at load time.
-
 Below, we explain both in more detail.

 #### Provide the dataset as a folder
@@ -52,24 +52,6 @@ def _extract_into_tensor(arr, timesteps, broadcast_shape):
    return res.expand(broadcast_shape)


-def _ensure_three_channels(tensor: torch.Tensor) -> torch.Tensor:
-    """
-    Ensure the tensor has exactly three channels (C, H, W) by repeating or truncating channels when needed.
-    """
-    if tensor.ndim == 2:
-        tensor = tensor.unsqueeze(0)
-    channels = tensor.shape[0]
-    if channels == 3:
-        return tensor
-    if channels == 1:
-        return tensor.repeat(3, 1, 1)
-    if channels == 2:
-        return torch.cat([tensor, tensor[:1]], dim=0)
-    if channels > 3:
-        return tensor[:3]
-    raise ValueError(f"Unsupported number of channels: {channels}")
-
-
 def parse_args():
    parser = argparse.ArgumentParser(description="Simple example of a training script.")
    parser.add_argument(
@@ -278,11 +260,6 @@ def parse_args():
    parser.add_argument(
        "--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
    )
-    parser.add_argument(
-        "--preserve_input_precision",
-        action="store_true",
-        help="Preserve 16/32-bit image precision by avoiding 8-bit RGB conversion while still producing 3-channel tensors.",
-    )

    args = parser.parse_args()
    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
@@ -476,41 +453,19 @@ def main(args):
        # https://huggingface.co/docs/datasets/v2.4.0/en/image_load#imagefolder

    # Preprocessing the datasets and DataLoaders creation.
-    spatial_augmentations = [
-        transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
-        transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
-        transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
-    ]
-
    augmentations = transforms.Compose(
-        spatial_augmentations
-        + [
+        [
+            transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
+            transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution),
+            transforms.RandomHorizontalFlip() if args.random_flip else transforms.Lambda(lambda x: x),
            transforms.ToTensor(),
            transforms.Normalize([0.5], [0.5]),
        ]
    )

-    precision_augmentations = transforms.Compose(
-        [
-            transforms.PILToTensor(),
-            transforms.Lambda(_ensure_three_channels),
-            transforms.ConvertImageDtype(torch.float32),
-        ]
-        + spatial_augmentations
-        + [transforms.Normalize([0.5], [0.5])]
-    )
-
    def transform_images(examples):
-        processed = []
-        for image in examples["image"]:
-            if not args.preserve_input_precision:
-                processed.append(augmentations(image.convert("RGB")))
-            else:
-                precise_image = image
-                if precise_image.mode == "P":
-                    precise_image = precise_image.convert("RGB")
-                processed.append(precision_augmentations(precise_image))
-        return {"input": processed}
+        images = [augmentations(image.convert("RGB")) for image in examples["image"]]
+        return {"input": images}

    logger.info(f"Dataset size: {len(dataset)}")

@@ -89,6 +89,8 @@ class CustomBlocksCommand(BaseDiffusersCLICommand):
        # automap = self._create_automap(parent_class=parent_class, child_class=child_class)
        # with open(CONFIG, "w") as f:
        #     json.dump(automap, f)
+        with open("requirements.txt", "w") as f:
+            f.write("")

    def _choose_block(self, candidates, chosen=None):
        for cls, base in candidates:
@@ -2213,10 +2213,6 @@ def _convert_non_diffusers_qwen_lora_to_diffusers(state_dict):

        state_dict = {convert_key(k): v for k, v in state_dict.items()}

-    has_default = any("default." in k for k in state_dict)
-    if has_default:
-        state_dict = {k.replace("default.", ""): v for k, v in state_dict.items()}
-
    converted_state_dict = {}
    all_keys = list(state_dict.keys())
    down_key = ".lora_down.weight"
@@ -4940,8 +4940,7 @@ class QwenImageLoraLoaderMixin(LoraBaseMixin):
        has_alphas_in_sd = any(k.endswith(".alpha") for k in state_dict)
        has_lora_unet = any(k.startswith("lora_unet_") for k in state_dict)
        has_diffusion_model = any(k.startswith("diffusion_model.") for k in state_dict)
-        has_default = any("default." in k for k in state_dict)
-        if has_alphas_in_sd or has_lora_unet or has_diffusion_model or has_default:
+        if has_alphas_in_sd or has_lora_unet or has_diffusion_model:
            state_dict = _convert_non_diffusers_qwen_lora_to_diffusers(state_dict)

        out = (state_dict, metadata) if return_lora_metadata else state_dict
@@ -649,86 +649,6 @@ def _(
 # ===== Helper functions to use attention backends with templated CP autograd functions =====


-def _native_attention_forward_op(
-    ctx: torch.autograd.function.FunctionCtx,
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    attn_mask: Optional[torch.Tensor] = None,
-    dropout_p: float = 0.0,
-    is_causal: bool = False,
-    scale: Optional[float] = None,
-    enable_gqa: bool = False,
-    return_lse: bool = False,
-    _save_ctx: bool = True,
-    _parallel_config: Optional["ParallelConfig"] = None,
-):
-    # Native attention does not return_lse
-    if return_lse:
-        raise ValueError("Native attention does not support return_lse=True")
-
-    # used for backward pass
-    if _save_ctx:
-        ctx.save_for_backward(query, key, value)
-        ctx.attn_mask = attn_mask
-        ctx.dropout_p = dropout_p
-        ctx.is_causal = is_causal
-        ctx.scale = scale
-        ctx.enable_gqa = enable_gqa
-
-    query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value))
-    out = torch.nn.functional.scaled_dot_product_attention(
-        query=query,
-        key=key,
-        value=value,
-        attn_mask=attn_mask,
-        dropout_p=dropout_p,
-        is_causal=is_causal,
-        scale=scale,
-        enable_gqa=enable_gqa,
-    )
-    out = out.permute(0, 2, 1, 3)
-
-    return out
-
-
-def _native_attention_backward_op(
-    ctx: torch.autograd.function.FunctionCtx,
-    grad_out: torch.Tensor,
-    *args,
-    **kwargs,
-):
-    query, key, value = ctx.saved_tensors
-
-    query.requires_grad_(True)
-    key.requires_grad_(True)
-    value.requires_grad_(True)
-
-    query_t, key_t, value_t = (x.permute(0, 2, 1, 3) for x in (query, key, value))
-    out = torch.nn.functional.scaled_dot_product_attention(
-        query=query_t,
-        key=key_t,
-        value=value_t,
-        attn_mask=ctx.attn_mask,
-        dropout_p=ctx.dropout_p,
-        is_causal=ctx.is_causal,
-        scale=ctx.scale,
-        enable_gqa=ctx.enable_gqa,
-    )
-    out = out.permute(0, 2, 1, 3)
-
-    grad_out_t = grad_out.permute(0, 2, 1, 3)
-    grad_query_t, grad_key_t, grad_value_t = torch.autograd.grad(
-        outputs=out, inputs=[query_t, key_t, value_t], grad_outputs=grad_out_t, retain_graph=False
-    )
-
-    grad_query = grad_query_t.permute(0, 2, 1, 3)
-    grad_key = grad_key_t.permute(0, 2, 1, 3)
-    grad_value = grad_value_t.permute(0, 2, 1, 3)
-
-    return grad_query, grad_key, grad_value
-
-
 # https://github.com/pytorch/pytorch/blob/8904ba638726f8c9a5aff5977c4aa76c9d2edfa6/aten/src/ATen/native/native_functions.yaml#L14958
 # forward declaration:
 #   aten::_scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, Tensor? attn_bias, bool compute_log_sumexp, float dropout_p=0., bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask)
@@ -1603,7 +1523,6 @@ def _native_flex_attention(
@_AttentionBackendRegistry.register(
    AttentionBackendName.NATIVE,
    constraints=[_check_device, _check_shape],
-    supports_context_parallel=True,
 )
 def _native_attention(
    query: torch.Tensor,
@@ -1619,35 +1538,18 @@ def _native_attention(
 ) -> torch.Tensor:
    if return_lse:
        raise ValueError("Native attention backend does not support setting `return_lse=True`.")
-    if _parallel_config is None:
-        query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value))
-        out = torch.nn.functional.scaled_dot_product_attention(
-            query=query,
-            key=key,
-            value=value,
-            attn_mask=attn_mask,
-            dropout_p=dropout_p,
-            is_causal=is_causal,
-            scale=scale,
-            enable_gqa=enable_gqa,
-        )
-        out = out.permute(0, 2, 1, 3)
-    else:
-        out = _templated_context_parallel_attention(
-            query,
-            key,
-            value,
-            attn_mask,
-            dropout_p,
-            is_causal,
-            scale,
-            enable_gqa,
-            return_lse,
-            forward_op=_native_attention_forward_op,
-            backward_op=_native_attention_backward_op,
-            _parallel_config=_parallel_config,
-        )
-
+    query, key, value = (x.permute(0, 2, 1, 3) for x in (query, key, value))
+    out = torch.nn.functional.scaled_dot_product_attention(
+        query=query,
+        key=key,
+        value=value,
+        attn_mask=attn_mask,
+        dropout_p=dropout_p,
+        is_causal=is_causal,
+        scale=scale,
+        enable_gqa=enable_gqa,
+    )
+    out = out.permute(0, 2, 1, 3)
    return out


@@ -147,13 +147,14 @@ class AutoModel(ConfigMixin):
            "force_download",
            "local_files_only",
            "proxies",
+            "resume_download",
            "revision",
            "token",
        ]
        hub_kwargs = {name: kwargs.pop(name, None) for name in hub_kwargs_names}

        # load_config_kwargs uses the same hub kwargs minus subfolder and resume_download
-        load_config_kwargs = {k: v for k, v in hub_kwargs.items() if k not in ["subfolder"]}
+        load_config_kwargs = {k: v for k, v in hub_kwargs.items() if k not in ["subfolder", "resume_download"]}

        library = None
        orig_class_name = None
@@ -204,6 +205,7 @@ class AutoModel(ConfigMixin):
                module_file=module_file,
                class_name=class_name,
                **hub_kwargs,
+                **kwargs,
            )
        else:
            from ..pipelines.pipeline_loading_utils import ALL_IMPORTABLE_CLASSES, get_class_obj_and_candidates
@@ -164,11 +164,7 @@ class AutoOffloadStrategy:

        device_type = execution_device.type
        device_module = getattr(torch, device_type, torch.cuda)
-        try:
-            mem_on_device = device_module.mem_get_info(execution_device.index)[0]
-        except AttributeError:
-            raise AttributeError(f"Do not know how to obtain obtain memory info for {str(device_module)}.")
-
+        mem_on_device = device_module.mem_get_info(execution_device.index)[0]
        mem_on_device = mem_on_device - self.memory_reserve_margin
        if current_module_size < mem_on_device:
            return []
@@ -703,8 +699,6 @@ class ComponentsManager:
        if not is_accelerate_available():
            raise ImportError("Make sure to install accelerate to use auto_cpu_offload")

-        # TODO: add a warning if mem_get_info isn't available on `device`.
-
        for name, component in self.components.items():
            if isinstance(component, torch.nn.Module) and hasattr(component, "_hf_hook"):
                remove_hook_from_module(component, recurse=True)
@@ -598,7 +598,7 @@ class FluxKontextRoPEInputsStep(ModularPipelineBlocks):
            and getattr(block_state, "image_width", None) is not None
        ):
            image_latent_height = 2 * (int(block_state.image_height) // (components.vae_scale_factor * 2))
-            image_latent_width = 2 * (int(block_state.image_width) // (components.vae_scale_factor * 2))
+            image_latent_width = 2 * (int(block_state.width) // (components.vae_scale_factor * 2))
            img_ids = FluxPipeline._prepare_latent_image_ids(
                None, image_latent_height // 2, image_latent_width // 2, device, dtype
            )
@@ -59,7 +59,7 @@ class FluxLoopDenoiser(ModularPipelineBlocks):
            ),
            InputParam(
                "guidance",
-                required=False,
+                required=True,
                type_hint=torch.Tensor,
                description="Guidance scale as a tensor",
            ),
@@ -141,7 +141,7 @@ class FluxKontextLoopDenoiser(ModularPipelineBlocks):
            ),
            InputParam(
                "guidance",
-                required=False,
+                required=True,
                type_hint=torch.Tensor,
                description="Guidance scale as a tensor",
            ),
@@ -95,7 +95,7 @@ class FluxProcessImagesInputStep(ModularPipelineBlocks):
            ComponentSpec(
                "image_processor",
                VaeImageProcessor,
-                config=FrozenDict({"vae_scale_factor": 16, "vae_latent_channels": 16}),
+                config=FrozenDict({"vae_scale_factor": 16}),
                default_creation_method="from_config",
            ),
        ]
@@ -143,6 +143,10 @@ class FluxProcessImagesInputStep(ModularPipelineBlocks):
 class FluxKontextProcessImagesInputStep(ModularPipelineBlocks):
    model_name = "flux-kontext"

+    def __init__(self, _auto_resize=True):
+        self._auto_resize = _auto_resize
+        super().__init__()
+
    @property
    def description(self) -> str:
        return (
@@ -163,7 +167,7 @@ class FluxKontextProcessImagesInputStep(ModularPipelineBlocks):

    @property
    def inputs(self) -> List[InputParam]:
-        return [InputParam("image"), InputParam("_auto_resize", type_hint=bool, default=True)]
+        return [InputParam("image")]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
@@ -191,8 +195,7 @@ class FluxKontextProcessImagesInputStep(ModularPipelineBlocks):
            img = images[0]
            image_height, image_width = components.image_processor.get_default_height_width(img)
            aspect_ratio = image_width / image_height
-            _auto_resize = block_state._auto_resize
-            if _auto_resize:
+            if self._auto_resize:
                # Kontext is trained on specific resolutions, using one of them is recommended
                _, image_width, image_height = min(
                    (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_KONTEXT_RESOLUTIONS
@@ -112,10 +112,6 @@ class FluxTextInputStep(ModularPipelineBlocks):
        block_state.prompt_embeds = block_state.prompt_embeds.view(
            block_state.batch_size * block_state.num_images_per_prompt, seq_len, -1
        )
-        pooled_prompt_embeds = block_state.pooled_prompt_embeds.repeat(1, block_state.num_images_per_prompt)
-        block_state.pooled_prompt_embeds = pooled_prompt_embeds.view(
-            block_state.batch_size * block_state.num_images_per_prompt, -1
-        )
        self.set_block_state(state, block_state)

        return components, state
@@ -39,7 +39,6 @@ from .modular_pipeline_utils import (
    InputParam,
    InsertableDict,
    OutputParam,
-    _validate_requirements,
    format_components,
    format_configs,
    make_doc_string,
@@ -240,7 +239,6 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):

    config_name = "modular_config.json"
    model_name = None
-    _requirements: Optional[Dict[str, str]] = None

    @classmethod
    def _get_signature_keys(cls, obj):
@@ -303,31 +301,29 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
        trust_remote_code: bool = False,
        **kwargs,
    ):
-        config = cls.load_config(pretrained_model_name_or_path)
-        has_remote_code = "auto_map" in config and cls.__name__ in config["auto_map"]
-        trust_remote_code = resolve_trust_remote_code(
-            trust_remote_code, pretrained_model_name_or_path, has_remote_code
-        )
-        if not (has_remote_code and trust_remote_code):
-            raise ValueError(
-                "Selected model repository does not happear to have any custom code or does not have a valid `config.json` file."
-            )
-
-        if "requirements" in config and config["requirements"] is not None:
-            _ = _validate_requirements(config["requirements"])
-
        hub_kwargs_names = [
            "cache_dir",
            "force_download",
            "local_files_only",
            "local_dir",
            "proxies",
+            "resume_download",
            "revision",
            "subfolder",
            "token",
        ]
        hub_kwargs = {name: kwargs.pop(name) for name in hub_kwargs_names if name in kwargs}

+        config = cls.load_config(pretrained_model_name_or_path)
+        has_remote_code = "auto_map" in config and cls.__name__ in config["auto_map"]
+        trust_remote_code = resolve_trust_remote_code(
+            trust_remote_code, pretrained_model_name_or_path, has_remote_code
+        )
+        if not has_remote_code and trust_remote_code:
+            raise ValueError(
+                "Selected model repository does not happear to have any custom code or does not have a valid `config.json` file."
+            )
+
        class_ref = config["auto_map"][cls.__name__]
        module_file, class_name = class_ref.split(".")
        module_file = module_file + ".py"
@@ -352,13 +348,8 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
        module = full_mod.rsplit(".", 1)[-1].replace("__dynamic__", "")
        parent_module = self.save_pretrained.__func__.__qualname__.split(".", 1)[0]
        auto_map = {f"{parent_module}": f"{module}.{cls_name}"}
+
        self.register_to_config(auto_map=auto_map)
-
-        # resolve requirements
-        requirements = _validate_requirements(getattr(self, "_requirements", None))
-        if requirements:
-            self.register_to_config(requirements=requirements)
-
        self.save_config(save_directory=save_directory, push_to_hub=push_to_hub, **kwargs)
        config = dict(self.config)
        self._internal_dict = FrozenDict(config)
@@ -1142,14 +1133,6 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
            expected_configs=self.expected_configs,
        )

-    @property
-    def _requirements(self) -> Dict[str, str]:
-        requirements = {}
-        for block_name, block in self.sub_blocks.items():
-            if getattr(block, "_requirements", None):
-                requirements[block_name] = block._requirements
-        return requirements
-

 class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
    """
@@ -2148,13 +2131,8 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
                        component_load_kwargs[key] = value["default"]
            try:
                components_to_register[name] = spec.load(**component_load_kwargs)
-            except Exception:
-                logger.warning(
-                    f"\nFailed to create component {name}:\n"
-                    f"- Component spec: {spec}\n"
-                    f"- load() called with kwargs: {component_load_kwargs}\n\n"
-                    f"{traceback.format_exc()}"
-                )
+            except Exception as e:
+                logger.warning(f"Failed to create component '{name}': {e}")

        # Register all components at once
        self.register_components(**components_to_register)
@@ -19,11 +19,9 @@ from dataclasses import dataclass, field, fields
 from typing import Any, Dict, List, Literal, Optional, Type, Union

 import torch
-from packaging.specifiers import InvalidSpecifier, SpecifierSet

 from ..configuration_utils import ConfigMixin, FrozenDict
 from ..utils import is_torch_available, logging
-from ..utils.import_utils import _is_package_available


 if is_torch_available():
@@ -672,86 +670,3 @@ def make_doc_string(
    output += format_output_params(outputs, indent_level=2)

    return output
-
-
-def _validate_requirements(reqs):
-    if reqs is None:
-        normalized_reqs = {}
-    else:
-        if not isinstance(reqs, dict):
-            raise ValueError(
-                "Requirements must be provided as a dictionary mapping package names to version specifiers."
-            )
-        normalized_reqs = _normalize_requirements(reqs)
-
-    if not normalized_reqs:
-        return {}
-
-    final: Dict[str, str] = {}
-    for req, specified_ver in normalized_reqs.items():
-        req_available, req_actual_ver = _is_package_available(req)
-        if not req_available:
-            logger.warning(f"{req} was specified in the requirements but wasn't found in the current environment.")
-
-        if specified_ver:
-            try:
-                specifier = SpecifierSet(specified_ver)
-            except InvalidSpecifier as err:
-                raise ValueError(f"Requirement specifier '{specified_ver}' for {req} is invalid.") from err
-
-            if req_actual_ver == "N/A":
-                logger.warning(
-                    f"Version of {req} could not be determined to validate requirement '{specified_ver}'. Things might work unexpected."
-                )
-            elif not specifier.contains(req_actual_ver, prereleases=True):
-                logger.warning(
-                    f"{req} requirement '{specified_ver}' is not satisfied by the installed version {req_actual_ver}. Things might work unexpected."
-                )
-
-        final[req] = specified_ver
-
-    return final
-
-
-def _normalize_requirements(reqs):
-    if not reqs:
-        return {}
-
-    normalized: "OrderedDict[str, str]" = OrderedDict()
-
-    def _accumulate(mapping: Dict[str, Any]):
-        for pkg, spec in mapping.items():
-            if isinstance(spec, dict):
-                # This is recursive because blocks are composable. This way, we can merge requirements
-                # from multiple blocks.
-                _accumulate(spec)
-                continue
-
-            pkg_name = str(pkg).strip()
-            if not pkg_name:
-                raise ValueError("Requirement package name cannot be empty.")
-
-            spec_str = "" if spec is None else str(spec).strip()
-            if spec_str and not spec_str.startswith(("<", ">", "=", "!", "~")):
-                spec_str = f"=={spec_str}"
-
-            existing_spec = normalized.get(pkg_name)
-            if existing_spec is not None:
-                if not existing_spec and spec_str:
-                    normalized[pkg_name] = spec_str
-                elif existing_spec and spec_str and existing_spec != spec_str:
-                    try:
-                        combined_spec = SpecifierSet(",".join(filter(None, [existing_spec, spec_str])))
-                    except InvalidSpecifier:
-                        logger.warning(
-                            f"Conflicting requirements for '{pkg_name}' detected: '{existing_spec}' vs '{spec_str}'. Keeping '{existing_spec}'."
-                        )
-                    else:
-                        normalized[pkg_name] = str(combined_spec)
-                continue
-
-            normalized[pkg_name] = spec_str
-
-    _accumulate(reqs)
-
-    return normalized
@@ -355,7 +355,7 @@ class StableDiffusion3ControlNetPipeline(
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt)
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)

        return prompt_embeds, pooled_prompt_embeds
@@ -373,7 +373,7 @@ class StableDiffusion3ControlNetInpaintingPipeline(
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt)
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)

        return prompt_embeds, pooled_prompt_embeds
@@ -326,7 +326,7 @@ class StableDiffusion3PAGPipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSin
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt)
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)

        return prompt_embeds, pooled_prompt_embeds
@@ -342,7 +342,7 @@ class StableDiffusion3PAGImg2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin,
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt)
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)

        return prompt_embeds, pooled_prompt_embeds
@@ -336,7 +336,7 @@ class StableDiffusion3Pipeline(DiffusionPipeline, SD3LoraLoaderMixin, FromSingle
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt)
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)

        return prompt_embeds, pooled_prompt_embeds
@@ -361,7 +361,7 @@ class StableDiffusion3Img2ImgPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt)
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)

        return prompt_embeds, pooled_prompt_embeds
@@ -367,7 +367,7 @@ class StableDiffusion3InpaintPipeline(DiffusionPipeline, SD3LoraLoaderMixin, Fro
        prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
        prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

-        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt)
+        pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1)
        pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1)

        return prompt_embeds, pooled_prompt_embeds
@@ -358,7 +358,6 @@ def get_cached_module_file(
                proxies=proxies,
                local_files_only=local_files_only,
                local_dir=local_dir,
-                revision=revision,
                token=token,
            )
            submodule = os.path.join("local", "--".join(pretrained_model_name_or_path.split("/")))
@@ -1,130 +0,0 @@
-# coding=utf-8
-# Copyright 2025 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import random
-import tempfile
-import unittest
-
-import numpy as np
-import PIL
-import torch
-
-from diffusers.image_processor import VaeImageProcessor
-from diffusers.modular_pipelines import (
-    FluxAutoBlocks,
-    FluxKontextAutoBlocks,
-    FluxKontextModularPipeline,
-    FluxModularPipeline,
-    ModularPipeline,
-)
-
-from ...testing_utils import floats_tensor, torch_device
-from ..test_modular_pipelines_common import ModularPipelineTesterMixin
-
-
-class FluxModularTests:
-    pipeline_class = FluxModularPipeline
-    pipeline_blocks_class = FluxAutoBlocks
-    repo = "hf-internal-testing/tiny-flux-modular"
-
-    def get_pipeline(self, components_manager=None, torch_dtype=torch.float32):
-        pipeline = self.pipeline_blocks_class().init_pipeline(self.repo, components_manager=components_manager)
-        pipeline.load_components(torch_dtype=torch_dtype)
-        return pipeline
-
-    def get_dummy_inputs(self, device, seed=0):
-        if str(device).startswith("mps"):
-            generator = torch.manual_seed(seed)
-        else:
-            generator = torch.Generator(device=device).manual_seed(seed)
-        inputs = {
-            "prompt": "A painting of a squirrel eating a burger",
-            "generator": generator,
-            "num_inference_steps": 2,
-            "guidance_scale": 5.0,
-            "height": 8,
-            "width": 8,
-            "max_sequence_length": 48,
-            "output_type": "np",
-        }
-        return inputs
-
-
-class FluxModularPipelineFastTests(FluxModularTests, ModularPipelineTesterMixin, unittest.TestCase):
-    params = frozenset(["prompt", "height", "width", "guidance_scale"])
-    batch_params = frozenset(["prompt"])
-
-
-class FluxImg2ImgModularPipelineFastTests(FluxModularTests, ModularPipelineTesterMixin, unittest.TestCase):
-    params = frozenset(["prompt", "height", "width", "guidance_scale", "image"])
-    batch_params = frozenset(["prompt", "image"])
-
-    def get_pipeline(self, components_manager=None, torch_dtype=torch.float32):
-        pipeline = super().get_pipeline(components_manager, torch_dtype)
-        # Override `vae_scale_factor` here as currently, `image_processor` is initialized with
-        # fixed constants instead of
-        # https://github.com/huggingface/diffusers/blob/d54622c2679d700b425ad61abce9b80fc36212c0/src/diffusers/pipelines/flux/pipeline_flux_img2img.py#L230C9-L232C10
-        pipeline.image_processor = VaeImageProcessor(vae_scale_factor=2)
-        return pipeline
-
-    def get_dummy_inputs(self, device, seed=0):
-        inputs = super().get_dummy_inputs(device, seed)
-        image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device)
-        image = image / 2 + 0.5
-        inputs["image"] = image
-        inputs["strength"] = 0.8
-        inputs["height"] = 8
-        inputs["width"] = 8
-        return inputs
-
-    def test_save_from_pretrained(self):
-        pipes = []
-        base_pipe = self.get_pipeline().to(torch_device)
-        pipes.append(base_pipe)
-
-        with tempfile.TemporaryDirectory() as tmpdirname:
-            base_pipe.save_pretrained(tmpdirname)
-            pipe = ModularPipeline.from_pretrained(tmpdirname).to(torch_device)
-            pipe.load_components(torch_dtype=torch.float32)
-            pipe.to(torch_device)
-            pipe.image_processor = VaeImageProcessor(vae_scale_factor=2)
-
-        pipes.append(pipe)
-
-        image_slices = []
-        for pipe in pipes:
-            inputs = self.get_dummy_inputs(torch_device)
-            image = pipe(**inputs, output="images")
-
-            image_slices.append(image[0, -3:, -3:, -1].flatten())
-
-        assert np.abs(image_slices[0] - image_slices[1]).max() < 1e-3
-
-
-class FluxKontextModularPipelineFastTests(FluxImg2ImgModularPipelineFastTests):
-    pipeline_class = FluxKontextModularPipeline
-    pipeline_blocks_class = FluxKontextAutoBlocks
-    repo = "hf-internal-testing/tiny-flux-kontext-pipe"
-
-    def get_dummy_inputs(self, device, seed=0):
-        inputs = super().get_dummy_inputs(device, seed)
-        image = PIL.Image.new("RGB", (32, 32), 0)
-        _ = inputs.pop("strength")
-        inputs["image"] = image
-        inputs["height"] = 8
-        inputs["width"] = 8
-        inputs["max_area"] = 8 * 8
-        inputs["_auto_resize"] = False
-        return inputs
@@ -21,12 +21,24 @@ import numpy as np
 import torch
 from PIL import Image

-from diffusers import ClassifierFreeGuidance, StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline
+from diffusers import (
+    ClassifierFreeGuidance,
+    StableDiffusionXLAutoBlocks,
+    StableDiffusionXLModularPipeline,
+)
 from diffusers.loaders import ModularIPAdapterMixin

-from ...models.unets.test_models_unet_2d_condition import create_ip_adapter_state_dict
-from ...testing_utils import enable_full_determinism, floats_tensor, torch_device
-from ..test_modular_pipelines_common import ModularPipelineTesterMixin
+from ...models.unets.test_models_unet_2d_condition import (
+    create_ip_adapter_state_dict,
+)
+from ...testing_utils import (
+    enable_full_determinism,
+    floats_tensor,
+    torch_device,
+)
+from ..test_modular_pipelines_common import (
+    ModularPipelineTesterMixin,
+)


 enable_full_determinism()
Author	SHA1	Message	Date
YiYi Xu	0b6dcf696a	Merge branch 'main' into custom-block-local-code	2025-10-30 12:11:17 -10:00
DN6	351b2f172a	update	2025-09-24 14:33:38 +05:30