Merge branch 'main' into torch-main-dep

This commit is contained in:
Sayak Paul
2025-09-12 15:46:59 +05:30
committed by GitHub
482 changed files with 12154 additions and 1983 deletions
+3
View File
@@ -340,6 +340,9 @@ jobs:
- backend: "optimum_quanto"
test_location: "quanto"
additional_deps: []
- backend: "nvidia_modelopt"
test_location: "modelopt"
additional_deps: []
runs-on:
group: aws-g6e-xlarge-plus
container:
+1 -9
View File
@@ -37,7 +37,7 @@ limitations under the License.
## Installation
We recommend installing 🤗 Diffusers in a virtual environment from PyPI or Conda. For more details about installing [PyTorch](https://pytorch.org/get-started/locally/) and [Flax](https://flax.readthedocs.io/en/latest/#installation), please refer to their official documentation.
We recommend installing 🤗 Diffusers in a virtual environment from PyPI or Conda. For more details about installing [PyTorch](https://pytorch.org/get-started/locally/), please refer to its official documentation.
### PyTorch
@@ -53,14 +53,6 @@ With `conda` (maintained by the community):
conda install -c conda-forge diffusers
```
### Flax
With `pip` (official package):
```bash
pip install --upgrade diffusers[flax]
```
### Apple Silicon (M1/M2) support
Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggingface.co/docs/diffusers/optimization/mps) guide.
+8 -12
View File
@@ -21,15 +21,17 @@
- local: using-diffusers/callback
title: Pipeline callbacks
- local: using-diffusers/reusing_seeds
title: Reproducible pipelines
title: Reproducibility
- local: using-diffusers/schedulers
title: Load schedulers and models
- local: using-diffusers/models
title: Models
- local: using-diffusers/scheduler_features
title: Scheduler features
- local: using-diffusers/other-formats
title: Model files and layouts
- local: using-diffusers/push_to_hub
title: Push files to the Hub
title: Sharing pipelines and models
- title: Adapters
isExpanded: false
@@ -58,14 +60,6 @@
title: Batch inference
- local: training/distributed_inference
title: Distributed inference
- local: using-diffusers/scheduler_features
title: Scheduler features
- local: using-diffusers/callback
title: Pipeline callbacks
- local: using-diffusers/reusing_seeds
title: Reproducible pipelines
- local: using-diffusers/image_quality
title: Controlling image quality
- title: Inference optimization
isExpanded: false
@@ -94,6 +88,8 @@
title: xDiT
- local: optimization/para_attn
title: ParaAttention
- local: using-diffusers/image_quality
title: FreeU
- title: Hybrid Inference
isExpanded: false
@@ -190,12 +186,12 @@
title: torchao
- local: quantization/quanto
title: quanto
- local: quantization/modelopt
title: NVIDIA ModelOpt
- title: Model accelerators and hardware
isExpanded: false
sections:
- local: using-diffusers/stable_diffusion_jax_how_to
title: JAX/Flax
- local: optimization/onnx
title: ONNX
- local: optimization/open_vino
+6
View File
@@ -20,6 +20,12 @@ All pipelines with [`VaeImageProcessor`] accept PIL Image, PyTorch tensor, or Nu
[[autodoc]] image_processor.VaeImageProcessor
## InpaintProcessor
The [`InpaintProcessor`] accepts `mask` and `image` inputs and processes them together. Optionally, it can accept `padding_mask_crop` and apply a mask overlay.
[[autodoc]] image_processor.InpaintProcessor
## VaeImageProcessorLDM3D
The [`VaeImageProcessorLDM3D`] accepts RGB and depth inputs and returns RGB and depth outputs.
@@ -44,15 +44,3 @@ model = AutoencoderKL.from_single_file(url)
## DecoderOutput
[[autodoc]] models.autoencoders.vae.DecoderOutput
## FlaxAutoencoderKL
[[autodoc]] FlaxAutoencoderKL
## FlaxAutoencoderKLOutput
[[autodoc]] models.vae_flax.FlaxAutoencoderKLOutput
## FlaxDecoderOutput
[[autodoc]] models.vae_flax.FlaxDecoderOutput
-8
View File
@@ -40,11 +40,3 @@ pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=contro
## ControlNetOutput
[[autodoc]] models.controlnets.controlnet.ControlNetOutput
## FlaxControlNetModel
[[autodoc]] FlaxControlNetModel
## FlaxControlNetOutput
[[autodoc]] models.controlnets.controlnet_flax.FlaxControlNetOutput
-4
View File
@@ -19,10 +19,6 @@ All models are built from the base [`ModelMixin`] class which is a [`torch.nn.Mo
## ModelMixin
[[autodoc]] ModelMixin
## FlaxModelMixin
[[autodoc]] FlaxModelMixin
## PushToHubMixin
[[autodoc]] utils.PushToHubMixin
-6
View File
@@ -23,9 +23,3 @@ The abstract from the paper is:
## UNet2DConditionOutput
[[autodoc]] models.unets.unet_2d_condition.UNet2DConditionOutput
## FlaxUNet2DConditionModel
[[autodoc]] models.unets.unet_2d_condition_flax.FlaxUNet2DConditionModel
## FlaxUNet2DConditionOutput
[[autodoc]] models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput
-4
View File
@@ -54,10 +54,6 @@ To check a specific pipeline or model output, refer to its corresponding API doc
[[autodoc]] pipelines.ImagePipelineOutput
## FlaxImagePipelineOutput
[[autodoc]] pipelines.pipeline_flax_utils.FlaxImagePipelineOutput
## AudioPipelineOutput
[[autodoc]] pipelines.AudioPipelineOutput
+1 -1
View File
@@ -50,7 +50,7 @@ from diffusers.utils import export_to_video
pipeline_quant_config = PipelineQuantizationConfig(
quant_backend="torchao",
quant_kwargs={"quant_type": "int8wo"},
components_to_quantize=["transformer"]
components_to_quantize="transformer"
)
# fp8 layerwise weight-casting
@@ -72,11 +72,3 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
## StableDiffusionPipelineOutput
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
## FlaxStableDiffusionControlNetPipeline
[[autodoc]] FlaxStableDiffusionControlNetPipeline
- all
- __call__
## FlaxStableDiffusionControlNetPipelineOutput
[[autodoc]] pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput
@@ -54,7 +54,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
"bnb_4bit_quant_type": "nf4",
"bnb_4bit_compute_dtype": torch.bfloat16
},
components_to_quantize=["transformer"]
components_to_quantize="transformer"
)
pipeline = HunyuanVideoPipeline.from_pretrained(
@@ -91,7 +91,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
"bnb_4bit_quant_type": "nf4",
"bnb_4bit_compute_dtype": torch.bfloat16
},
components_to_quantize=["transformer"]
components_to_quantize="transformer"
)
pipeline = HunyuanVideoPipeline.from_pretrained(
@@ -139,7 +139,7 @@ export_to_video(video, "output.mp4", fps=15)
"bnb_4bit_quant_type": "nf4",
"bnb_4bit_compute_dtype": torch.bfloat16
},
components_to_quantize=["transformer"]
components_to_quantize="transformer"
)
pipeline = HunyuanVideoPipeline.from_pretrained(
-4
View File
@@ -106,10 +106,6 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
[[autodoc]] pipelines.StableDiffusionMixin.disable_freeu
## FlaxDiffusionPipeline
[[autodoc]] pipelines.pipeline_flax_utils.FlaxDiffusionPipeline
## PushToHubMixin
[[autodoc]] utils.PushToHubMixin
@@ -120,6 +120,12 @@ The `guidance_scale` parameter in the pipeline is there to support future guidan
- all
- __call__
## QwenImageEditInpaintPipeline
[[autodoc]] QwenImageEditInpaintPipeline
- all
- __call__
## QwenImageControlNetPipeline
[[autodoc]] QwenImageControlNetPipeline
- all
- __call__
@@ -47,13 +47,3 @@ Make sure to check out the Stable Diffusion [Tips](overview#tips) section to lea
## StableDiffusionPipelineOutput
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
## FlaxStableDiffusionImg2ImgPipeline
[[autodoc]] FlaxStableDiffusionImg2ImgPipeline
- all
- __call__
## FlaxStableDiffusionPipelineOutput
[[autodoc]] pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput
@@ -49,13 +49,3 @@ If you're interested in using one of the official checkpoints for a task, explor
## StableDiffusionPipelineOutput
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
## FlaxStableDiffusionInpaintPipeline
[[autodoc]] FlaxStableDiffusionInpaintPipeline
- all
- __call__
## FlaxStableDiffusionPipelineOutput
[[autodoc]] pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput
@@ -51,13 +51,3 @@ If you're interested in using one of the official checkpoints for a task, explor
## StableDiffusionPipelineOutput
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
## FlaxStableDiffusionPipeline
[[autodoc]] FlaxStableDiffusionPipeline
- all
- __call__
## FlaxStableDiffusionPipelineOutput
[[autodoc]] pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput
+2 -23
View File
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
# Installation
Diffusers is tested on Python 3.8+, PyTorch 1.4+, and Flax 0.4.1+. Follow the installation instructions for the deep learning library you're using, [PyTorch](https://pytorch.org/get-started/locally/) or [Flax](https://flax.readthedocs.io/en/latest/).
Diffusers is tested on Python 3.8+ and PyTorch 1.4+. Install [PyTorch](https://pytorch.org/get-started/locally/) according to your system and setup.
Create a [virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) for easier management of separate projects and to avoid compatibility issues between dependencies. Use [uv](https://docs.astral.sh/uv/), a Rust-based Python package and project manager, to create a virtual environment and install Diffusers.
@@ -32,12 +32,6 @@ PyTorch only supports Python 3.8 - 3.11 on Windows.
uv pip install diffusers["torch"] transformers
```
Use the command below for Flax.
```bash
uv pip install diffusers["flax"] transformers
```
</hfoption>
<hfoption id="conda">
@@ -71,27 +65,12 @@ An editable install is recommended for development workflows or if you're using
Clone the repository and install Diffusers with the following commands.
<hfoptions id="editable">
<hfoption id="PyTorch">
```bash
git clone https://github.com/huggingface/diffusers.git
cd diffusers
uv pip install -e ".[torch]"
```
</hfoption>
<hfoption id="Flax">
```bash
git clone https://github.com/huggingface/diffusers.git
cd diffusers
uv pip install -e ".[flax]"
```
</hfoption>
</hfoptions>
> [!WARNING]
> You must keep the `diffusers` folder if you want to keep using the library with the editable install.
@@ -140,7 +119,7 @@ For more details about managing and cleaning the cache, take a look at the [Unde
## Telemetry logging
Diffusers gathers telemetry information during [`~DiffusionPipeline.from_pretrained`] requests.
The data gathered includes the Diffusers and PyTorch/Flax version, the requested model or pipeline class,
The data gathered includes the Diffusers and PyTorch version, the requested model or pipeline class,
and the path to a pretrained checkpoint if it is hosted on the Hub.
This usage data helps us debug issues and prioritize new features.
@@ -51,10 +51,10 @@ t2i_pipeline = t2i_blocks.init_pipeline(modular_repo_id, components_manager=comp
</hfoption>
</hfoptions>
Components are only loaded and registered when using [`~ModularPipeline.load_components`] or [`~ModularPipeline.load_default_components`]. The example below uses [`~ModularPipeline.load_default_components`] to create a second pipeline that reuses all the components from the first one, and assigns it to a different collection
Components are only loaded and registered when using [`~ModularPipeline.load_components`]. The example below uses [`~ModularPipeline.load_components`] to create a second pipeline that reuses all the components from the first one, and assigns it to a different collection.
```py
pipe.load_default_components()
pipe.load_components()
pipe2 = ModularPipeline.from_pretrained("YiYiXu/modular-demo-auto", components_manager=comp, collection="test2")
```
@@ -187,4 +187,4 @@ comp.enable_auto_cpu_offload(device="cuda")
All models begin on the CPU and [`ComponentsManager`] moves them to the appropriate device right before they're needed, and moves other models back to the CPU when GPU memory is low.
You can set your own rules for which models to offload first.
You can set your own rules for which models to offload first.
+3 -3
View File
@@ -75,13 +75,13 @@ Guiders that are already saved on the Hub with a `modular_model_index.json` file
}
```
The guider is only created after calling [`~ModularPipeline.load_default_components`] based on the loading specification in `modular_model_index.json`.
The guider is only created after calling [`~ModularPipeline.load_components`] based on the loading specification in `modular_model_index.json`.
```py
t2i_pipeline = t2i_blocks.init_pipeline("YiYiXu/modular-doc-guider")
# not created during init
assert t2i_pipeline.guider is None
t2i_pipeline.load_default_components()
t2i_pipeline.load_components()
# loaded as PAG guider
t2i_pipeline.guider
```
@@ -172,4 +172,4 @@ t2i_pipeline.push_to_hub("YiYiXu/modular-doc-guider")
```
</hfoption>
</hfoptions>
</hfoptions>
@@ -29,7 +29,7 @@ blocks = SequentialPipelineBlocks.from_blocks_dict(TEXT2IMAGE_BLOCKS)
modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
pipeline = blocks.init_pipeline(modular_repo_id)
pipeline.load_default_components(torch_dtype=torch.float16)
pipeline.load_components(torch_dtype=torch.float16)
pipeline.to("cuda")
image = pipeline(prompt="Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", output="images")[0]
@@ -49,7 +49,7 @@ blocks = SequentialPipelineBlocks.from_blocks_dict(IMAGE2IMAGE_BLOCKS)
modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
pipeline = blocks.init_pipeline(modular_repo_id)
pipeline.load_default_components(torch_dtype=torch.float16)
pipeline.load_components(torch_dtype=torch.float16)
pipeline.to("cuda")
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png"
@@ -73,7 +73,7 @@ blocks = SequentialPipelineBlocks.from_blocks_dict(INPAINT_BLOCKS)
modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
pipeline = blocks.init_pipeline(modular_repo_id)
pipeline.load_default_components(torch_dtype=torch.float16)
pipeline.load_components(torch_dtype=torch.float16)
pipeline.to("cuda")
img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png"
@@ -176,15 +176,15 @@ diffdiff_pipeline = ModularPipeline.from_pretrained(modular_repo_id, trust_remot
## Loading components
A [`ModularPipeline`] doesn't automatically instantiate with components. It only loads the configuration and component specifications. You can load all components with [`~ModularPipeline.load_default_components`] or only load specific components with [`~ModularPipeline.load_components`].
A [`ModularPipeline`] doesn't automatically instantiate with components. It only loads the configuration and component specifications. Call [`~ModularPipeline.load_components`] without any component names to load all components, or pass the `names` of specific components to load only those.
<hfoptions id="load">
<hfoption id="load_default_components">
<hfoption id="load_components">
```py
import torch
t2i_pipeline.load_default_components(torch_dtype=torch.float16)
t2i_pipeline.load_components(torch_dtype=torch.float16)
t2i_pipeline.to("cuda")
```
@@ -355,4 +355,4 @@ The [config.json](https://huggingface.co/YiYiXu/modular-diffdiff-0704/blob/main/
"ModularPipelineBlocks": "block.DiffDiffBlocks"
}
}
```
```
@@ -173,9 +173,9 @@ print(dd_blocks)
## ModularPipeline
Convert the [`SequentialPipelineBlocks`] into a [`ModularPipeline`] with the [`ModularPipeline.init_pipeline`] method. This initializes the expected components to load from a `modular_model_index.json` file. Explicitly load the components by calling [`ModularPipeline.load_default_components`].
Convert the [`SequentialPipelineBlocks`] into a [`ModularPipeline`] with the [`ModularPipeline.init_pipeline`] method. This initializes the expected components to load from a `modular_model_index.json` file. Explicitly load the components by calling [`ModularPipeline.load_components`].
It is a good idea to initialize the [`ComponentManager`] with the pipeline to help manage the different components. Once you call [`~ModularPipeline.load_default_components`], the components are registered to the [`ComponentManager`] and can be shared between workflows. The example below uses the `collection` argument to assign the components a `"diffdiff"` label for better organization.
It is a good idea to initialize the [`ComponentsManager`] with the pipeline to help manage the different components. Once you call [`~ModularPipeline.load_components`], the components are registered to the [`ComponentsManager`] and can be shared between workflows. The example below uses the `collection` argument to assign the components a `"diffdiff"` label for better organization.
```py
from diffusers.modular_pipelines import ComponentsManager
@@ -209,11 +209,11 @@ Use the [`sub_blocks.insert`] method to insert it into the [`ModularPipeline`].
dd_blocks.sub_blocks.insert("ip_adapter", ip_adapter_block, 0)
```
Call [`~ModularPipeline.init_pipeline`] to initialize a [`ModularPipeline`] and use [`~ModularPipeline.load_default_components`] to load the model components. Load and set the IP-Adapter to run the pipeline.
Call [`~ModularPipeline.init_pipeline`] to initialize a [`ModularPipeline`] and use [`~ModularPipeline.load_components`] to load the model components. Load and set the IP-Adapter to run the pipeline.
```py
dd_pipeline = dd_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
dd_pipeline.load_default_components(torch_dtype=torch.float16)
dd_pipeline.load_components(torch_dtype=torch.float16)
dd_pipeline.loader.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
dd_pipeline.loader.set_ip_adapter_scale(0.6)
dd_pipeline = dd_pipeline.to(device)
@@ -260,14 +260,14 @@ class SDXLDiffDiffControlNetDenoiseStep(StableDiffusionXLDenoiseLoopWrapper):
controlnet_denoise_block = SDXLDiffDiffControlNetDenoiseStep()
```
Insert the `controlnet_input` block and replace the `denoise` block with the new `controlnet_denoise_block`. Initialize a [`ModularPipeline`] and [`~ModularPipeline.load_default_components`] into it.
Insert the `controlnet_input` block and replace the `denoise` block with the new `controlnet_denoise_block`. Initialize a [`ModularPipeline`] and call [`~ModularPipeline.load_components`] to load the model components into it.
```py
dd_blocks.sub_blocks.insert("controlnet_input", control_input_block, 7)
dd_blocks.sub_blocks["denoise"] = controlnet_denoise_block
dd_pipeline = dd_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
dd_pipeline.load_default_components(torch_dtype=torch.float16)
dd_pipeline.load_components(torch_dtype=torch.float16)
dd_pipeline = dd_pipeline.to(device)
control_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/diffdiff_tomato_canny.jpeg")
@@ -320,7 +320,7 @@ Call [`SequentialPipelineBlocks.from_blocks_dict`] to create a [`SequentialPipel
```py
dd_auto_blocks = SequentialPipelineBlocks.from_blocks_dict(DIFFDIFF_AUTO_BLOCKS)
dd_pipeline = dd_auto_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
dd_pipeline.load_default_components(torch_dtype=torch.float16)
dd_pipeline.load_components(torch_dtype=torch.float16)
```
## Share
@@ -340,5 +340,5 @@ from diffusers.modular_pipelines import ModularPipeline, ComponentsManager
components = ComponentsManager()
diffdiff_pipeline = ModularPipeline.from_pretrained("YiYiXu/modular-diffdiff-0704", trust_remote_code=True, components_manager=components, collection="diffdiff")
diffdiff_pipeline.load_default_components(torch_dtype=torch.float16)
```
diffdiff_pipeline.load_components(torch_dtype=torch.float16)
```
+46 -3
View File
@@ -291,13 +291,53 @@ Group offloading moves groups of internal layers ([torch.nn.ModuleList](https://
> [!WARNING]
> Group offloading may not work with all models if the forward implementation contains weight-dependent device casting of inputs because it may clash with group offloading's device casting mechanism.
Call [`~ModelMixin.enable_group_offload`] to enable it for standard Diffusers model components that inherit from [`ModelMixin`]. For other model components that don't inherit from [`ModelMixin`], such as a generic [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), use [`~hooks.apply_group_offloading`] instead.
The `offload_type` parameter can be set to `block_level` or `leaf_level`.
Enable group offloading by configuring the `offload_type` parameter to `block_level` or `leaf_level`.
- `block_level` offloads groups of layers based on the `num_blocks_per_group` parameter. For example, if `num_blocks_per_group=2` on a model with 40 layers, 2 layers are onloaded and offloaded at a time (20 total onloads/offloads). This drastically reduces memory requirements.
- `leaf_level` offloads individual layers at the lowest level and is equivalent to [CPU offloading](#cpu-offloading). But it can be made faster if you use streams without giving up inference speed.
Group offloading is supported for entire pipelines or individual models. Applying group offloading to the entire pipeline is the easiest option while selectively applying it to individual models gives users more flexibility to use different offloading techniques for different models.
<hfoptions id="group-offloading">
<hfoption id="pipeline">
Call [`~DiffusionPipeline.enable_group_offload`] on a pipeline.
```py
import torch
from diffusers import CogVideoXPipeline
from diffusers.hooks import apply_group_offloading
from diffusers.utils import export_to_video
onload_device = torch.device("cuda")
offload_device = torch.device("cpu")
pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
pipeline.enable_group_offload(
onload_device=onload_device,
offload_device=offload_device,
offload_type="leaf_level",
use_stream=True
)
prompt = (
"A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
"The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
"pandas gather, watching curiously and some clapping in rhythm. Sunlight filters through the tall bamboo, "
"casting a gentle glow on the scene. The panda's face is expressive, showing concentration and joy as it plays. "
"The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
"atmosphere of this unique musical performance."
)
video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
export_to_video(video, "output.mp4", fps=8)
```
</hfoption>
<hfoption id="model">
Call [`~ModelMixin.enable_group_offload`] on standard Diffusers model components that inherit from [`ModelMixin`]. For other model components that don't inherit from [`ModelMixin`], such as a generic [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), use [`~hooks.apply_group_offloading`] instead.
```py
import torch
from diffusers import CogVideoXPipeline
@@ -328,6 +368,9 @@ print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} G
export_to_video(video, "output.mp4", fps=8)
```
</hfoption>
</hfoptions>
#### CUDA stream
The `use_stream` parameter can be activated for CUDA devices that support asynchronous data transfer streams to reduce overall execution time compared to [CPU offloading](#cpu-offloading). It overlaps data transfer and computation by using layer prefetching. The next layer to be executed is loaded onto the GPU while the current layer is still being executed. It can increase CPU memory significantly so ensure you have 2x the amount of memory as the model size.
+141
View File
@@ -0,0 +1,141 @@
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License. -->
# NVIDIA ModelOpt
[NVIDIA-ModelOpt](https://github.com/NVIDIA/TensorRT-Model-Optimizer) is a unified library of state-of-the-art model optimization techniques like quantization, pruning, distillation, speculative decoding, etc. It compresses deep learning models for downstream deployment frameworks like TensorRT-LLM or TensorRT to optimize inference speed.
Before you begin, make sure you have nvidia_modelopt installed.
```bash
pip install -U "nvidia_modelopt[hf]"
```
Quantize a model by passing [`NVIDIAModelOptConfig`] to [`~ModelMixin.from_pretrained`] (you can also load pre-quantized models). This works for any model in any modality, as long as it supports loading with [Accelerate](https://hf.co/docs/accelerate/index) and contains `torch.nn.Linear` layers.
The example below only quantizes the weights to FP8.
```python
import torch
from diffusers import AutoModel, SanaPipeline, NVIDIAModelOptConfig
model_id = "Efficient-Large-Model/Sana_600M_1024px_diffusers"
dtype = torch.bfloat16
quantization_config = NVIDIAModelOptConfig(quant_type="FP8", quant_method="modelopt")
transformer = AutoModel.from_pretrained(
model_id,
subfolder="transformer",
quantization_config=quantization_config,
torch_dtype=dtype,
)
pipe = SanaPipeline.from_pretrained(
model_id,
transformer=transformer,
torch_dtype=dtype,
)
pipe.to("cuda")
print(f"Pipeline memory usage: {torch.cuda.max_memory_reserved() / 1024**3:.3f} GB")
prompt = "A cat holding a sign that says hello world"
image = pipe(
prompt, num_inference_steps=50, guidance_scale=4.5, max_sequence_length=512
).images[0]
image.save("output.png")
```
> **Note:**
>
> The quantization methods in NVIDIA-ModelOpt are designed to reduce the memory footprint of model weights using various QAT (Quantization-Aware Training) and PTQ (Post-Training Quantization) techniques while maintaining model performance. However, the actual performance gain during inference depends on the deployment framework (e.g., TRT-LLM, TensorRT) and the specific hardware configuration.
>
> More details can be found [here](https://github.com/NVIDIA/TensorRT-Model-Optimizer/tree/main/examples).
## NVIDIAModelOptConfig
The `NVIDIAModelOptConfig` class accepts the following parameters:
- `quant_type`: A string value mentioning one of the quantization types below.
- `modules_to_not_convert`: A list of full or partial module names for which quantization should not be performed. For example, to not perform any quantization of the [`SD3Transformer2DModel`]'s pos_embed projection blocks, one would specify: `modules_to_not_convert=["pos_embed.proj.weight"]`.
- `disable_conv_quantization`: A boolean value which when set to `True` disables quantization for all convolutional layers in the model. This is useful as channel and block quantization generally don't work well with convolutional layers (used with INT4, NF4, NVFP4). If you want to disable quantization for specific convolutional layers, use `modules_to_not_convert` instead.
- `algorithm`: The algorithm to use for determining scale, defaults to `"max"`. You can check modelopt documentation for more algorithms and details.
- `forward_loop`: The forward loop function to use for calibrating activation during quantization. If not provided, it relies on static scale values computed using the weights only.
- `kwargs`: A dict of keyword arguments to pass to the underlying quantization method which will be invoked based on `quant_type`.
## Supported quantization types
ModelOpt supports weight-only, channel, and block quantization for the INT8, FP8, INT4, NF4, and NVFP4 formats. The quantization methods are designed to reduce the memory footprint of the model weights while maintaining the performance of the model during inference.
Weight-only quantization stores the model weights in a specific low-bit data type but performs computation with a higher-precision data type, like `bfloat16`. This lowers the memory requirements from model weights but retains the memory peaks for activation computation.
The quantization methods supported are as follows:
| **Quantization Type** | **Supported Schemes** | **Required Kwargs** | **Additional Notes** |
|-----------------------|-----------------------|---------------------|----------------------|
| **INT8** | `int8 weight only`, `int8 channel quantization`, `int8 block quantization` | `quant_type`, `quant_type + channel_quantize`, `quant_type + channel_quantize + block_quantize` |
| **FP8** | `fp8 weight only`, `fp8 channel quantization`, `fp8 block quantization` | `quant_type`, `quant_type + channel_quantize`, `quant_type + channel_quantize + block_quantize` |
| **INT4** | `int4 weight only`, `int4 block quantization` | `quant_type`, `quant_type + channel_quantize + block_quantize` | `channel_quantize = -1 is only supported for now`|
| **NF4** | `nf4 weight only`, `nf4 double block quantization` | `quant_type`, `quant_type + channel_quantize + block_quantize + scale_channel_quantize + scale_block_quantize` | `channel_quantize = -1 and scale_channel_quantize = -1 are only supported for now` |
| **NVFP4** | `nvfp4 weight only`, `nvfp4 block quantization` | `quant_type`, `quant_type + channel_quantize + block_quantize` | `channel_quantize = -1 is only supported for now`|
Refer to the [official modelopt documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/) for a better understanding of the available quantization methods and the exhaustive list of configuration options available.
## Serializing and Deserializing quantized models
To serialize a quantized model in a given dtype, first load the model with the desired quantization dtype and then save it using the [`~ModelMixin.save_pretrained`] method.
```python
import torch
from diffusers import AutoModel, NVIDIAModelOptConfig
from modelopt.torch.opt import enable_huggingface_checkpointing
enable_huggingface_checkpointing()
model_id = "Efficient-Large-Model/Sana_600M_1024px_diffusers"
quant_config_fp8 = {"quant_type": "FP8", "quant_method": "modelopt"}
quant_config_fp8 = NVIDIAModelOptConfig(**quant_config_fp8)
model = AutoModel.from_pretrained(
model_id,
subfolder="transformer",
quantization_config=quant_config_fp8,
torch_dtype=torch.bfloat16,
)
model.save_pretrained('path/to/sana_fp8', safe_serialization=False)
```
To load a serialized quantized model, use the [`~ModelMixin.from_pretrained`] method.
```python
import torch
from diffusers import AutoModel, NVIDIAModelOptConfig, SanaPipeline
from modelopt.torch.opt import enable_huggingface_checkpointing
enable_huggingface_checkpointing()
quantization_config = NVIDIAModelOptConfig(quant_type="FP8", quant_method="modelopt")
transformer = AutoModel.from_pretrained(
"path/to/sana_fp8",
subfolder="transformer",
quantization_config=quantization_config,
torch_dtype=torch.bfloat16,
)
pipe = SanaPipeline.from_pretrained(
"Efficient-Large-Model/Sana_600M_1024px_diffusers",
transformer=transformer,
torch_dtype=torch.bfloat16,
)
pipe.to("cuda")
prompt = "A cat holding a sign that says hello world"
image = pipe(
prompt, num_inference_steps=50, guidance_scale=4.5, max_sequence_length=512
).images[0]
image.save("output.png")
```
+4 -1
View File
@@ -34,7 +34,9 @@ Initialize [`~quantizers.PipelineQuantizationConfig`] with the following paramet
> [!TIP]
> These `quant_kwargs` arguments are different for each backend. Refer to the [Quantization API](../api/quantization) docs to view the arguments for each backend.
- `components_to_quantize` specifies which components of the pipeline to quantize. Typically, you should quantize the most compute intensive components like the transformer. The text encoder is another component to consider quantizing if a pipeline has more than one such as [`FluxPipeline`]. The example below quantizes the T5 text encoder in [`FluxPipeline`] while keeping the CLIP model intact.
- `components_to_quantize` specifies which component(s) of the pipeline to quantize. Typically, you should quantize the most compute-intensive components like the transformer. The text encoder is another component to consider quantizing, especially when a pipeline has more than one text encoder, such as [`FluxPipeline`]. The example below quantizes the T5 text encoder in [`FluxPipeline`] while keeping the CLIP model intact.
`components_to_quantize` accepts either a list for multiple models or a string for a single model.
The example below loads the bitsandbytes backend with the following arguments from [`~quantizers.quantization_config.BitsAndBytesConfig`], `load_in_4bit`, `bnb_4bit_quant_type`, and `bnb_4bit_compute_dtype`.
@@ -62,6 +64,7 @@ pipe = DiffusionPipeline.from_pretrained(
image = pipe("photo of a cute dog").images[0]
```
### Advanced quantization
The `quant_mapping` argument provides more options for how to quantize each individual component in a pipeline, like combining different quantization backends.
+2 -81
View File
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.
[ControlNet](https://hf.co/papers/2302.05543) models are adapters trained on top of another pretrained model. It allows for a greater degree of control over image generation by conditioning the model with an additional input image. The input image can be a canny edge, depth map, human pose, and many more.
If you're training on a GPU with limited vRAM, you should try enabling the `gradient_checkpointing`, `gradient_accumulation_steps`, and `mixed_precision` parameters in the training command. You can also reduce your memory footprint by using memory-efficient attention with [xFormers](../optimization/xformers). JAX/Flax training is also supported for efficient training on TPUs and GPUs, but it doesn't support gradient checkpointing or xFormers. You should have a GPU with >30GB of memory if you want to train faster with Flax.
If you're training on a GPU with limited vRAM, you should try enabling the `gradient_checkpointing`, `gradient_accumulation_steps`, and `mixed_precision` parameters in the training command. You can also reduce your memory footprint by using memory-efficient attention with [xFormers](../optimization/xformers).
This guide will explore the [train_controlnet.py](https://github.com/huggingface/diffusers/blob/main/examples/controlnet/train_controlnet.py) training script to help you become familiar with it, and how you can adapt it for your own use-case.
@@ -28,45 +28,10 @@ pip install .
Then navigate to the example folder containing the training script and install the required dependencies for the script you're using:
<hfoptions id="installation">
<hfoption id="PyTorch">
```bash
cd examples/controlnet
pip install -r requirements.txt
```
</hfoption>
<hfoption id="Flax">
If you have access to a TPU, the Flax training script runs even faster! Let's run the training script on the [Google Cloud TPU VM](https://cloud.google.com/tpu/docs/run-calculation-jax). Create a single TPU v4-8 VM and connect to it:
```bash
ZONE=us-central2-b
TPU_TYPE=v4-8
VM_NAME=hg_flax
gcloud alpha compute tpus tpu-vm create $VM_NAME \
--zone $ZONE \
--accelerator-type $TPU_TYPE \
--version tpu-vm-v4-base
gcloud alpha compute tpus tpu-vm ssh $VM_NAME --zone $ZONE -- \
```
Install JAX 0.4.5:
```bash
pip install "jax[tpu]==0.4.5" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
```
Then install the required dependencies for the Flax script:
```bash
cd examples/controlnet
pip install -r requirements_flax.txt
```
</hfoption>
</hfoptions>
<Tip>
@@ -120,7 +85,7 @@ Many of the basic and important parameters are described in the [Text-to-image](
### Min-SNR weighting
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch and is unavailable in the Flax training script.
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch.
Add the `--snr_gamma` parameter and set it to the recommended value of 5.0:
@@ -272,9 +237,6 @@ That's it! You don't need to add any additional parameters to your training comm
</hfoption>
</hfoptions>
<hfoptions id="training-inference">
<hfoption id="PyTorch">
```bash
export MODEL_DIR="stable-diffusion-v1-5/stable-diffusion-v1-5"
export OUTPUT_DIR="path/to/save/model"
@@ -292,47 +254,6 @@ accelerate launch train_controlnet.py \
--push_to_hub
```
</hfoption>
<hfoption id="Flax">
With Flax, you can [profile your code](https://jax.readthedocs.io/en/latest/profiling.html) by adding the `--profile_steps=5` parameter to your training command. Install the Tensorboard profile plugin:
```bash
pip install tensorflow tensorboard-plugin-profile
tensorboard --logdir runs/fill-circle-100steps-20230411_165612/
```
Then you can inspect the profile at [http://localhost:6006/#profile](http://localhost:6006/#profile).
<Tip warning={true}>
If you run into version conflicts with the plugin, try uninstalling and reinstalling all versions of TensorFlow and Tensorboard. The debugging functionality of the profile plugin is still experimental, and not all views are fully functional. The `trace_viewer` cuts off events after 1M, which can result in all your device traces getting lost if, for example, you profile the compilation step by accident.
</Tip>
```bash
python3 train_controlnet_flax.py \
--pretrained_model_name_or_path=$MODEL_DIR \
--output_dir=$OUTPUT_DIR \
--dataset_name=fusing/fill50k \
--resolution=512 \
--learning_rate=1e-5 \
--validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \
--validation_prompt "red circle with blue background" "cyan circle with brown floral background" \
--validation_steps=1000 \
--train_batch_size=2 \
--revision="non-ema" \
--from_pt \
--report_to="wandb" \
--tracker_project_name=$HUB_MODEL_ID \
--num_train_epochs=11 \
--push_to_hub \
--hub_model_id=$HUB_MODEL_ID
```
</hfoption>
</hfoptions>
Once training is complete, you can use your newly trained model for inference!
```py
@@ -223,7 +223,7 @@ from diffusers.image_processor import VaeImageProcessor
import torch
vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=torch.bfloat16).to("cuda")
vae_scale_factor = 2 ** (len(vae.config.block_out_channels))
vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)
with torch.no_grad():
+2 -78
View File
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.
[DreamBooth](https://huggingface.co/papers/2208.12242) is a training technique that updates the entire diffusion model by training on just a few images of a subject or style. It works by associating a special word in the prompt with the example images.
If you're training on a GPU with limited vRAM, you should try enabling the `gradient_checkpointing` and `mixed_precision` parameters in the training command. You can also reduce your memory footprint by using memory-efficient attention with [xFormers](../optimization/xformers). JAX/Flax training is also supported for efficient training on TPUs and GPUs, but it doesn't support gradient checkpointing or xFormers. You should have a GPU with >30GB of memory if you want to train faster with Flax.
If you're training on a GPU with limited vRAM, you should try enabling the `gradient_checkpointing` and `mixed_precision` parameters in the training command. You can also reduce your memory footprint by using memory-efficient attention with [xFormers](../optimization/xformers).
This guide will explore the [train_dreambooth.py](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py) script to help you become more familiar with it, and how you can adapt it for your own use-case.
@@ -28,25 +28,11 @@ pip install .
Navigate to the example folder with the training script and install the required dependencies for the script you're using:
<hfoptions id="installation">
<hfoption id="PyTorch">
```bash
cd examples/dreambooth
pip install -r requirements.txt
```
</hfoption>
<hfoption id="Flax">
```bash
cd examples/dreambooth
pip install -r requirements_flax.txt
```
</hfoption>
</hfoptions>
<Tip>
🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more.
@@ -110,7 +96,7 @@ Some basic and important parameters to know and specify are:
### Min-SNR weighting
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch and is unavailable in the Flax training script.
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch.
Add the `--snr_gamma` parameter and set it to the recommended value of 5.0:
@@ -311,9 +297,6 @@ That's it! You don't need to add any additional parameters to your training comm
</hfoption>
</hfoptions>
<hfoptions id="training-inference">
<hfoption id="PyTorch">
```bash
export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
export INSTANCE_DIR="./dog"
@@ -334,29 +317,6 @@ accelerate launch train_dreambooth.py \
--push_to_hub
```
</hfoption>
<hfoption id="Flax">
```bash
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
export INSTANCE_DIR="./dog"
export OUTPUT_DIR="path-to-save-model"
python train_dreambooth_flax.py \
--pretrained_model_name_or_path=$MODEL_NAME \
--instance_data_dir=$INSTANCE_DIR \
--output_dir=$OUTPUT_DIR \
--instance_prompt="a photo of sks dog" \
--resolution=512 \
--train_batch_size=1 \
--learning_rate=5e-6 \
--max_train_steps=400 \
--push_to_hub
```
</hfoption>
</hfoptions>
Once training is complete, you can use your newly trained model for inference!
<Tip>
@@ -383,9 +343,6 @@ image.save("dog-bucket.png")
</Tip>
<hfoptions id="training-inference">
<hfoption id="PyTorch">
```py
from diffusers import DiffusionPipeline
import torch
@@ -395,39 +352,6 @@ image = pipeline("A photo of sks dog in a bucket", num_inference_steps=50, guida
image.save("dog-bucket.png")
```
</hfoption>
<hfoption id="Flax">
```py
import jax
import numpy as np
from flax.jax_utils import replicate
from flax.training.common_utils import shard
from diffusers import FlaxStableDiffusionPipeline
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained("path-to-your-trained-model", dtype=jax.numpy.bfloat16)
prompt = "A photo of sks dog in a bucket"
prng_seed = jax.random.PRNGKey(0)
num_inference_steps = 50
num_samples = jax.device_count()
prompt = num_samples * [prompt]
prompt_ids = pipeline.prepare_inputs(prompt)
# shard inputs and rng
params = replicate(params)
prng_seed = jax.random.split(prng_seed, jax.device_count())
prompt_ids = shard(prompt_ids)
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
image.save("dog-bucket.png")
```
</hfoption>
</hfoptions>
## LoRA
LoRA is a training technique for significantly reducing the number of trainable parameters. As a result, training is faster and it is easier to store the resulting weights because they are a lot smaller (~100MBs). Use the [train_dreambooth_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora.py) script to train with LoRA.
+1 -1
View File
@@ -88,7 +88,7 @@ Most of the parameters are identical to the parameters in the [Text-to-image](te
### Min-SNR weighting
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch and is unavailable in the Flax training script.
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch.
Add the `--snr_gamma` parameter and set it to the recommended value of 5.0:
-14
View File
@@ -38,25 +38,11 @@ pip install .
Navigate to the example folder with the training script and install the required dependencies for the script you're using:
<hfoptions id="installation">
<hfoption id="PyTorch">
```bash
cd examples/text_to_image
pip install -r requirements.txt
```
</hfoption>
<hfoption id="Flax">
```bash
cd examples/text_to_image
pip install -r requirements_flax.txt
```
</hfoption>
</hfoptions>
<Tip>
🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more.
+13 -13
View File
@@ -23,18 +23,18 @@ Each training script is:
Our current collection of training scripts includes:
| Training | SDXL-support | LoRA-support | Flax-support |
|---|---|---|---|
| [unconditional image generation](https://github.com/huggingface/diffusers/tree/main/examples/unconditional_image_generation) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) | | | |
| [text-to-image](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image) | 👍 | 👍 | 👍 |
| [textual inversion](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb) | | | 👍 |
| [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb) | 👍 | 👍 | 👍 |
| [ControlNet](https://github.com/huggingface/diffusers/tree/main/examples/controlnet) | 👍 | | 👍 |
| [InstructPix2Pix](https://github.com/huggingface/diffusers/tree/main/examples/instruct_pix2pix) | 👍 | | |
| [Custom Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/custom_diffusion) | | | |
| [T2I-Adapters](https://github.com/huggingface/diffusers/tree/main/examples/t2i_adapter) | 👍 | | |
| [Kandinsky 2.2](https://github.com/huggingface/diffusers/tree/main/examples/kandinsky2_2/text_to_image) | | 👍 | |
| [Wuerstchen](https://github.com/huggingface/diffusers/tree/main/examples/wuerstchen/text_to_image) | | 👍 | |
| Training | SDXL-support | LoRA-support |
|---|---|---|
| [unconditional image generation](https://github.com/huggingface/diffusers/tree/main/examples/unconditional_image_generation) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) | | |
| [text-to-image](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image) | 👍 | 👍 |
| [textual inversion](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb) | | |
| [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb) | 👍 | 👍 |
| [ControlNet](https://github.com/huggingface/diffusers/tree/main/examples/controlnet) | 👍 | |
| [InstructPix2Pix](https://github.com/huggingface/diffusers/tree/main/examples/instruct_pix2pix) | 👍 | |
| [Custom Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/custom_diffusion) | | |
| [T2I-Adapters](https://github.com/huggingface/diffusers/tree/main/examples/t2i_adapter) | 👍 | |
| [Kandinsky 2.2](https://github.com/huggingface/diffusers/tree/main/examples/kandinsky2_2/text_to_image) | | 👍 |
| [Wuerstchen](https://github.com/huggingface/diffusers/tree/main/examples/wuerstchen/text_to_image) | | 👍 |
These examples are **actively** maintained, so please feel free to open an issue if they aren't working as expected. If you feel like another training example should be included, you're more than welcome to start a [Feature Request](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feature_request.md&title=) to discuss your feature idea with us and whether it meets our criteria of being self-contained, easy-to-tweak, beginner-friendly, and single-purpose.
@@ -48,7 +48,7 @@ cd diffusers
pip install .
```
Then navigate to the folder of the training script (for example, [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth)) and install the `requirements.txt` file. Some training scripts have a specific requirement file for SDXL, LoRA or Flax. If you're using one of these scripts, make sure you install its corresponding requirements file.
Then navigate to the folder of the training script (for example, [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth)) and install the `requirements.txt` file. Some training scripts have a specific requirement file for SDXL or LoRA. If you're using one of these scripts, make sure you install its corresponding requirements file.
```bash
cd examples/dreambooth
+1 -1
View File
@@ -96,7 +96,7 @@ Most of the parameters are identical to the parameters in the [Text-to-image](te
### Min-SNR weighting
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting either `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch and is unavailable in the Flax training script.
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting either `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch.
Add the `--snr_gamma` parameter and set it to the recommended value of 5.0:
+2 -83
View File
@@ -20,7 +20,7 @@ The text-to-image script is experimental, and it's easy to overfit and run into
Text-to-image models like Stable Diffusion are conditioned to generate images given a text prompt.
Training a model can be taxing on your hardware, but if you enable `gradient_checkpointing` and `mixed_precision`, it is possible to train a model on a single 24GB GPU. If you're training with larger batch sizes or want to train faster, it's better to use GPUs with more than 30GB of memory. You can reduce your memory footprint by enabling memory-efficient attention with [xFormers](../optimization/xformers). JAX/Flax training is also supported for efficient training on TPUs and GPUs, but it doesn't support gradient checkpointing, gradient accumulation or xFormers. A GPU with at least 30GB of memory or a TPU v3 is recommended for training with Flax.
Training a model can be taxing on your hardware, but if you enable `gradient_checkpointing` and `mixed_precision`, it is possible to train a model on a single 24GB GPU. If you're training with larger batch sizes or want to train faster, it's better to use GPUs with more than 30GB of memory. You can reduce your memory footprint by enabling memory-efficient attention with [xFormers](../optimization/xformers).
This guide will explore the [train_text_to_image.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) training script to help you become familiar with it, and how you can adapt it for your own use-case.
@@ -34,20 +34,10 @@ pip install .
Then navigate to the example folder containing the training script and install the required dependencies for the script you're using:
<hfoptions id="installation">
<hfoption id="PyTorch">
```bash
cd examples/text_to_image
pip install -r requirements.txt
```
</hfoption>
<hfoption id="Flax">
```bash
cd examples/text_to_image
pip install -r requirements_flax.txt
```
</hfoption>
</hfoptions>
<Tip>
@@ -106,7 +96,7 @@ Some basic and important parameters include:
### Min-SNR weighting
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch and is unavailable in the Flax training script.
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch.
Add the `--snr_gamma` parameter and set it to the recommended value of 5.0:
@@ -155,9 +145,6 @@ Lastly, the [training loop](https://github.com/huggingface/diffusers/blob/8959c5
Once you've made all your changes or you're okay with the default configuration, you're ready to launch the training script! 🚀
<hfoptions id="training-inference">
<hfoption id="PyTorch">
Let's train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path). If you're training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command.
<Tip>
@@ -187,43 +174,8 @@ accelerate launch --mixed_precision="fp16" train_text_to_image.py \
--push_to_hub
```
</hfoption>
<hfoption id="Flax">
Training with Flax can be faster on TPUs and GPUs thanks to [@duongna21](https://github.com/duongna21). Flax is more efficient on a TPU, but GPU performance is also great.
Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path).
<Tip>
To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment variables to the path of the dataset and where to save the model to.
</Tip>
```bash
export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
export dataset_name="lambdalabs/naruto-blip-captions"
python train_text_to_image_flax.py \
--pretrained_model_name_or_path=$MODEL_NAME \
--dataset_name=$dataset_name \
--resolution=512 --center_crop --random_flip \
--train_batch_size=1 \
--max_train_steps=15000 \
--learning_rate=1e-05 \
--max_grad_norm=1 \
--output_dir="sd-naruto-model" \
--push_to_hub
```
</hfoption>
</hfoptions>
Once training is complete, you can use your newly trained model for inference:
<hfoptions id="training-inference">
<hfoption id="PyTorch">
```py
from diffusers import StableDiffusionPipeline
import torch
@@ -234,39 +186,6 @@ image = pipeline(prompt="yoda").images[0]
image.save("yoda-naruto.png")
```
</hfoption>
<hfoption id="Flax">
```py
import jax
import numpy as np
from flax.jax_utils import replicate
from flax.training.common_utils import shard
from diffusers import FlaxStableDiffusionPipeline
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained("path/to/saved_model", dtype=jax.numpy.bfloat16)
prompt = "yoda naruto"
prng_seed = jax.random.PRNGKey(0)
num_inference_steps = 50
num_samples = jax.device_count()
prompt = num_samples * [prompt]
prompt_ids = pipeline.prepare_inputs(prompt)
# shard inputs and rng
params = replicate(params)
prng_seed = jax.random.split(prng_seed, jax.device_count())
prompt_ids = shard(prompt_ids)
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
image.save("yoda-naruto.png")
```
</hfoption>
</hfoptions>
## Next steps
Congratulations on training your own text-to-image model! To learn more about how to use your new model, the following guides may be helpful:
+1 -83
View File
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.
[Textual Inversion](https://hf.co/papers/2208.01618) is a training technique for personalizing image generation models with just a few example images of what you want it to learn. This technique works by learning and updating the text embeddings (the new embeddings are tied to a special word you must use in the prompt) to match the example images you provide.
If you're training on a GPU with limited vRAM, you should try enabling the `gradient_checkpointing` and `mixed_precision` parameters in the training command. You can also reduce your memory footprint by using memory-efficient attention with [xFormers](../optimization/xformers). JAX/Flax training is also supported for efficient training on TPUs and GPUs, but it doesn't support gradient checkpointing or xFormers. With the same configuration and setup as PyTorch, the Flax training script should be at least ~70% faster!
If you're training on a GPU with limited vRAM, you should try enabling the `gradient_checkpointing` and `mixed_precision` parameters in the training command. You can also reduce your memory footprint by using memory-efficient attention with [xFormers](../optimization/xformers).
This guide will explore the [textual_inversion.py](https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py) script to help you become more familiar with it, and how you can adapt it for your own use-case.
@@ -28,25 +28,10 @@ pip install .
Navigate to the example folder with the training script and install the required dependencies for the script you're using:
<hfoptions id="installation">
<hfoption id="PyTorch">
```bash
cd examples/textual_inversion
pip install -r requirements.txt
```
</hfoption>
<hfoption id="Flax">
```bash
cd examples/textual_inversion
pip install -r requirements_flax.txt
```
</hfoption>
</hfoptions>
<Tip>
🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more.
@@ -189,9 +174,6 @@ One more thing before you launch the script. If you're interested in following a
--validation_steps=100
```
<hfoptions id="training-inference">
<hfoption id="PyTorch">
```bash
export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
export DATA_DIR="./cat"
@@ -214,36 +196,8 @@ accelerate launch textual_inversion.py \
--push_to_hub
```
</hfoption>
<hfoption id="Flax">
```bash
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
export DATA_DIR="./cat"
python textual_inversion_flax.py \
--pretrained_model_name_or_path=$MODEL_NAME \
--train_data_dir=$DATA_DIR \
--learnable_property="object" \
--placeholder_token="<cat-toy>" \
--initializer_token="toy" \
--resolution=512 \
--train_batch_size=1 \
--max_train_steps=3000 \
--learning_rate=5.0e-04 \
--scale_lr \
--output_dir="textual_inversion_cat" \
--push_to_hub
```
</hfoption>
</hfoptions>
After training is complete, you can use your newly trained model for inference like:
<hfoptions id="training-inference">
<hfoption id="PyTorch">
```py
from diffusers import StableDiffusionPipeline
import torch
@@ -254,42 +208,6 @@ image = pipeline("A <cat-toy> train", num_inference_steps=50).images[0]
image.save("cat-train.png")
```
</hfoption>
<hfoption id="Flax">
Flax doesn't support the [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] method, but the textual_inversion_flax.py script [saves](https://github.com/huggingface/diffusers/blob/c0f058265161178f2a88849e92b37ffdc81f1dcc/examples/textual_inversion/textual_inversion_flax.py#L636C2-L636C2) the learned embeddings as a part of the model after training. This means you can use the model for inference like any other Flax model:
```py
import jax
import numpy as np
from flax.jax_utils import replicate
from flax.training.common_utils import shard
from diffusers import FlaxStableDiffusionPipeline
model_path = "path-to-your-trained-model"
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(model_path, dtype=jax.numpy.bfloat16)
prompt = "A <cat-toy> train"
prng_seed = jax.random.PRNGKey(0)
num_inference_steps = 50
num_samples = jax.device_count()
prompt = num_samples * [prompt]
prompt_ids = pipeline.prepare_inputs(prompt)
# shard inputs and rng
params = replicate(params)
prng_seed = jax.random.split(prng_seed, jax.device_count())
prompt_ids = shard(prompt_ids)
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
image.save("cat-train.png")
```
</hfoption>
</hfoptions>
## Next steps
Congratulations on training your own Textual Inversion model! 🎉 To learn more about how to use your new model, the following guides may be helpful:
+42 -98
View File
@@ -12,112 +12,56 @@ specific language governing permissions and limitations under the License.
# AutoPipeline
Diffusers provides many pipelines for basic tasks like generating images, videos, audio, and inpainting. On top of these, there are specialized pipelines for adapters and features like upscaling, super-resolution, and more. Different pipeline classes can even use the same checkpoint because they share the same pretrained model! With so many different pipelines, it can be overwhelming to know which pipeline class to use.
[AutoPipeline](../api/pipelines/auto_pipeline) is a *task-and-model* pipeline that automatically selects the correct pipeline subclass based on the task. It handles the complexity of loading different pipeline subclasses, so you don't need to know the specific pipeline subclass name.
The [AutoPipeline](../api/pipelines/auto_pipeline) class is designed to simplify the variety of pipelines in Diffusers. It is a generic *task-first* pipeline that lets you focus on a task ([`AutoPipelineForText2Image`], [`AutoPipelineForImage2Image`], and [`AutoPipelineForInpainting`]) without needing to know the specific pipeline class. The [AutoPipeline](../api/pipelines/auto_pipeline) automatically detects the correct pipeline class to use.
This is unlike [`DiffusionPipeline`], a *model-only* pipeline that automatically selects the pipeline subclass based on the model.
For example, let's use the [dreamlike-art/dreamlike-photoreal-2.0](https://hf.co/dreamlike-art/dreamlike-photoreal-2.0) checkpoint.
Under the hood, [AutoPipeline](../api/pipelines/auto_pipeline):
1. Detects a `"stable-diffusion"` class from the [model_index.json](https://hf.co/dreamlike-art/dreamlike-photoreal-2.0/blob/main/model_index.json) file.
2. Depending on the task you're interested in, it loads the [`StableDiffusionPipeline`], [`StableDiffusionImg2ImgPipeline`], or [`StableDiffusionInpaintPipeline`]. Any parameter (`strength`, `num_inference_steps`, etc.) you would pass to these specific pipelines can also be passed to the [AutoPipeline](../api/pipelines/auto_pipeline).
<hfoptions id="autopipeline">
<hfoption id="text-to-image">
[`AutoPipelineForImage2Image`] returns a specific pipeline subclass (for example, [`StableDiffusionXLImg2ImgPipeline`]), which can only be used for image-to-image tasks.
```py
from diffusers import AutoPipelineForText2Image
import torch
pipe_txt2img = AutoPipelineForText2Image.from_pretrained(
"dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16, use_safetensors=True
).to("cuda")
prompt = "cinematic photo of Godzilla eating sushi with a cat in a izakaya, 35mm photograph, film, professional, 4k, highly detailed"
generator = torch.Generator(device="cpu").manual_seed(37)
image = pipe_txt2img(prompt, generator=generator).images[0]
image
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-text2img.png"/>
</div>
</hfoption>
<hfoption id="image-to-image">
```py
from diffusers import AutoPipelineForImage2Image
from diffusers.utils import load_image
import torch
pipe_img2img = AutoPipelineForImage2Image.from_pretrained(
"dreamlike-art/dreamlike-photoreal-2.0", torch_dtype=torch.float16, use_safetensors=True
).to("cuda")
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-text2img.png")
prompt = "cinematic photo of Godzilla eating burgers with a cat in a fast food restaurant, 35mm photograph, film, professional, 4k, highly detailed"
generator = torch.Generator(device="cpu").manual_seed(53)
image = pipe_img2img(prompt, image=init_image, generator=generator).images[0]
image
```
Notice how the [dreamlike-art/dreamlike-photoreal-2.0](https://hf.co/dreamlike-art/dreamlike-photoreal-2.0) checkpoint is used for both text-to-image and image-to-image tasks? To save memory and avoid loading the checkpoint twice, use the [`~DiffusionPipeline.from_pipe`] method.
```py
pipe_img2img = AutoPipelineForImage2Image.from_pipe(pipe_txt2img).to("cuda")
image = pipeline(prompt, image=init_image, generator=generator).images[0]
image
```
You can learn more about the [`~DiffusionPipeline.from_pipe`] method in the [Reuse a pipeline](../using-diffusers/loading#reuse-a-pipeline) guide.
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-img2img.png"/>
</div>
</hfoption>
<hfoption id="inpainting">
```py
from diffusers import AutoPipelineForInpainting
from diffusers.utils import load_image
import torch
pipeline = AutoPipelineForInpainting.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, use_safetensors=True
).to("cuda")
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-img2img.png")
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-mask.png")
prompt = "cinematic photo of a owl, 35mm photograph, film, professional, 4k, highly detailed"
generator = torch.Generator(device="cpu").manual_seed(38)
image = pipeline(prompt, image=init_image, mask_image=mask_image, generator=generator, strength=0.4).images[0]
image
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/autopipeline-inpaint.png"/>
</div>
</hfoption>
</hfoptions>
## Unsupported checkpoints
The [AutoPipeline](../api/pipelines/auto_pipeline) supports [Stable Diffusion](../api/pipelines/stable_diffusion/overview), [Stable Diffusion XL](../api/pipelines/stable_diffusion/stable_diffusion_xl), [ControlNet](../api/pipelines/controlnet), [Kandinsky 2.1](../api/pipelines/kandinsky.md), [Kandinsky 2.2](../api/pipelines/kandinsky_v22), and [DeepFloyd IF](../api/pipelines/deepfloyd_if) checkpoints.
If you try to load an unsupported checkpoint, you'll get an error.
```py
from diffusers import AutoPipelineForImage2Image
import torch
pipeline = AutoPipelineForImage2Image.from_pretrained(
"openai/shap-e-img2img", torch_dtype=torch.float16, use_safetensors=True
"RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.bfloat16, device_map="cuda",
)
print(pipeline)
"StableDiffusionXLImg2ImgPipeline {
"_class_name": "StableDiffusionXLImg2ImgPipeline",
...
"
```
Loading the same model with [`DiffusionPipeline`] returns the [`StableDiffusionXLPipeline`] subclass. It can be used for text-to-image, image-to-image, or inpainting tasks depending on the inputs.
```py
import torch
from diffusers import DiffusionPipeline
pipeline = DiffusionPipeline.from_pretrained(
"RunDiffusion/Juggernaut-XL-v9", torch_dtype=torch.bfloat16, device_map="cuda",
)
print(pipeline)
"StableDiffusionXLPipeline {
"_class_name": "StableDiffusionXLPipeline",
...
"
```
Check the [mappings](https://github.com/huggingface/diffusers/blob/130fd8df54f24ffb006d84787b598d8adc899f23/src/diffusers/pipelines/auto_pipeline.py#L114) to see whether a model is supported or not.
Trying to load an unsupported model returns an error.
```py
import torch
from diffusers import AutoPipelineForImage2Image
pipeline = AutoPipelineForImage2Image.from_pretrained(
"openai/shap-e-img2img", torch_dtype=torch.float16,
)
"ValueError: AutoPipeline can't find a pipeline linked to ShapEImg2ImgPipeline for None"
```
There are three types of [AutoPipeline](../api/pipelines/auto_pipeline) classes, [`AutoPipelineForText2Image`], [`AutoPipelineForImage2Image`] and [`AutoPipelineForInpainting`]. Each of these classes has a predefined mapping, linking a pipeline to its task-specific subclass.
When [`~AutoPipelineForText2Image.from_pretrained`] is called, it extracts the class name from the `model_index.json` file and selects the appropriate pipeline subclass for the task based on the mapping.
@@ -10,13 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
specific language governing permissions and limitations under the License.
-->
# Controlling image quality
The components of a diffusion model, like the UNet and scheduler, can be optimized to improve the quality of generated images leading to better details. These techniques are especially useful if you don't have the resources to simply use a larger model for inference. You can enable these techniques during inference without any additional training.
This guide will show you how to turn these techniques on in your pipeline and how to configure them to improve the quality of your generated images.
## Details
# FreeU
[FreeU](https://hf.co/papers/2309.11497) improves image details by rebalancing the UNet's backbone and skip connection weights. The skip connections can cause the model to overlook some of the backbone semantics which may lead to unnatural image details in the generated image. This technique does not require any additional training and can be applied on the fly during inference for tasks like image-to-image and text-to-video.
@@ -139,7 +133,7 @@ export_to_video(video_frames, "teddy_bear.mp4", fps=10)
</hfoption>
</hfoptions>
Call the [`pipelines.StableDiffusionMixin.disable_freeu`] method to disable FreeU.
Call the [`~pipelines.StableDiffusionMixin.disable_freeu`] method to disable FreeU.
```py
pipeline.disable_freeu()
+6 -33
View File
@@ -108,23 +108,20 @@ print(pipeline.transformer.dtype, pipeline.vae.dtype)
The `device_map` argument determines individual model or pipeline placement on an accelerator like a GPU. It is especially helpful when there are multiple GPUs.
Diffusers currently provides three options to `device_map`, `"cuda"`, `"balanced"` and `"auto"`. Refer to the table below to compare the three placement strategies.
A pipeline supports two options for `device_map`, `"cuda"` and `"balanced"`. Refer to the table below to compare the placement strategies.
| parameter | description |
|---|---|
| `"cuda"` | places model or pipeline on CUDA device |
| `"balanced"` | evenly distributes model or pipeline on all GPUs |
| `"auto"` | distribute model from fastest device first to slowest |
| `"cuda"` | places pipeline on a supported accelerator device like CUDA |
| `"balanced"` | evenly distributes pipeline on all GPUs |
Use the `max_memory` argument in [`~DiffusionPipeline.from_pretrained`] to allocate a maximum amount of memory to use on each device. By default, Diffusers uses the maximum amount available.
<hfoptions id="device_map">
<hfoption id="pipeline">
```py
import torch
from diffusers import DiffusionPipeline
max_memory = {0: "16GB", 1: "16GB"}
pipeline = DiffusionPipeline.from_pretrained(
"Qwen/Qwen-Image",
torch_dtype=torch.bfloat16,
@@ -132,26 +129,6 @@ pipeline = DiffusionPipeline.from_pretrained(
)
```
</hfoption>
<hfoption id="individual model">
```py
import torch
from diffusers import AutoModel
max_memory = {0: "16GB", 1: "16GB"}
transformer = AutoModel.from_pretrained(
"Qwen/Qwen-Image",
subfolder="transformer",
torch_dtype=torch.bfloat16
device_map="cuda",
max_memory=max_memory
)
```
</hfoption>
</hfoptions>
The `hf_device_map` attribute allows you to access and view the `device_map`.
```py
@@ -189,22 +166,18 @@ pipeline = DiffusionPipeline.from_pretrained(
[`DiffusionPipeline`] is flexible and accommodates loading different models or schedulers. You can experiment with different schedulers to optimize for generation speed or quality, and you can replace models with more performant ones.
The example below swaps the default scheduler to generate higher quality images and a more stable VAE version. Pass the `subfolder` argument in [`~HeunDiscreteScheduler.from_pretrained`] to load the scheduler to the correct subfolder.
The example below uses a more stable VAE version.
```py
import torch
from diffusers import DiffusionPipeline, HeunDiscreteScheduler, AutoModel
from diffusers import DiffusionPipeline, AutoModel
scheduler = HeunDiscreteScheduler.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"
)
vae = AutoModel.from_pretrained(
"madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
)
pipeline = DiffusionPipeline.from_pretrained(
"stabilityai/stable-diffusion-xl-base-1.0",
scheduler=scheduler,
vae=vae,
torch_dtype=torch.float16,
device_map="cuda"
+120
View File
@@ -0,0 +1,120 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
[[open-in-colab]]
# Models
A diffusion model relies on a few individual models working together to generate an output. These models are responsible for denoising, encoding inputs, and decoding latents into the actual outputs.
This guide will show you how to load models.
## Loading a model
All models are loaded with the [`~ModelMixin.from_pretrained`] method, which downloads and caches the latest model version. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache.
Pass the `subfolder` argument to [`~ModelMixin.from_pretrained`] to specify where to load the model weights from. Omit the `subfolder` argument if the repository doesn't have a subfolder structure or if you're loading a standalone model.
```py
from diffusers import QwenImageTransformer2DModel
model = QwenImageTransformer2DModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer")
```
## AutoModel
[`AutoModel`] detects the model class from a `model_index.json` file or a model's `config.json` file. It fetches the correct model class from these files and delegates the actual loading to the model class. [`AutoModel`] is useful for automatic model type detection without needing to know the exact model class beforehand.
```py
from diffusers import AutoModel
model = AutoModel.from_pretrained(
"Qwen/Qwen-Image", subfolder="transformer"
)
```
## Model data types
Use the `torch_dtype` argument in [`~ModelMixin.from_pretrained`] to load a model with a specific data type. This allows you to load a model in a lower precision to reduce memory usage.
```py
import torch
from diffusers import QwenImageTransformer2DModel
model = QwenImageTransformer2DModel.from_pretrained(
"Qwen/Qwen-Image",
subfolder="transformer",
torch_dtype=torch.bfloat16
)
```
[nn.Module.to](https://docs.pytorch.org/docs/stable/generated/torch.nn.Module.html#torch.nn.Module.to) can also convert to a specific data type on the fly. However, it converts *all* weights to the requested data type, unlike `torch_dtype`, which respects `_keep_in_fp32_modules`. This attribute preserves layers in `torch.float32` for numerical stability and best generation quality (see this [_keep_in_fp32_modules](https://github.com/huggingface/diffusers/blob/f864a9a352fa4a220d860bfdd1782e3e5af96382/src/diffusers/models/transformers/transformer_wan.py#L374) example).
```py
from diffusers import QwenImageTransformer2DModel
model = QwenImageTransformer2DModel.from_pretrained(
"Qwen/Qwen-Image", subfolder="transformer"
)
model = model.to(dtype=torch.float16)
```
## Device placement
Use the `device_map` argument in [`~ModelMixin.from_pretrained`] to place a model on an accelerator like a GPU. It is especially helpful when there are multiple GPUs.
Diffusers currently provides three options to `device_map` for individual models, `"cuda"`, `"balanced"` and `"auto"`. Refer to the table below to compare the three placement strategies.
| parameter | description |
|---|---|
| `"cuda"` | places model on a supported accelerator (CUDA) |
| `"balanced"` | evenly distributes model on all GPUs |
| `"auto"` | distributes model from fastest device first to slowest |
Use the `max_memory` argument in [`~ModelMixin.from_pretrained`] to allocate a maximum amount of memory to use on each device. By default, Diffusers uses the maximum amount available.
```py
import torch
from diffusers import QwenImagePipeline
max_memory = {0: "16GB", 1: "16GB"}
pipeline = QwenImagePipeline.from_pretrained(
"Qwen/Qwen-Image",
torch_dtype=torch.bfloat16,
device_map="cuda",
max_memory=max_memory
)
```
The `hf_device_map` attribute allows you to access and view the `device_map`.
```py
print(transformer.hf_device_map)
# {'': device(type='cuda')}
```
## Saving models
Save a model with the [`~ModelMixin.save_pretrained`] method.
```py
from diffusers import QwenImageTransformer2DModel
model = QwenImageTransformer2DModel.from_pretrained("Qwen/Qwen-Image", subfolder="transformer")
model.save_pretrained("./local/model")
```
For large models, it is helpful to use `max_shard_size` to save a model as multiple shards. Shards can be loaded faster and save memory (refer to the [parallel loading](./loading#parallel-loading) docs for more details), especially if there is more than one GPU.
```py
model.save_pretrained("./local/model", max_shard_size="5GB")
```
@@ -176,7 +176,7 @@ Benefits of using the Diffusers-multifolder layout include:
).to("cuda")
turbo_pipeline.scheduler = EulerDiscreteScheduler.from_config(
turbo_pipeline.scheduler.config,
timestep+spacing="trailing"
timestep_spacing="trailing"
)
image = turbo_pipeline(
"an astronaut riding a unicorn on mars",
@@ -267,6 +267,7 @@ pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_d
save_folder = "flux-dev"
pipe.save_pretrained("flux-dev")
export_folder_as_dduf("flux-dev.dduf", folder_path=save_folder)
```
> [!TIP]
> Packaging and loading quantized checkpoints in the DDUF format is supported as long as they respect the multi-folder structure.
+28 -21
View File
@@ -10,19 +10,22 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
specific language governing permissions and limitations under the License.
-->
# Push files to the Hub
[[open-in-colab]]
🤗 Diffusers provides a [`~diffusers.utils.PushToHubMixin`] for uploading your model, scheduler, or pipeline to the Hub. It is an easy way to store your files on the Hub, and also allows you to share your work with others. Under the hood, the [`~diffusers.utils.PushToHubMixin`]:
# Sharing pipelines and models
Share your pipeline or models and schedulers on the Hub with the [`~diffusers.utils.PushToHubMixin`] class. This class:
1. creates a repository on the Hub
2. saves your model, scheduler, or pipeline files so they can be reloaded later
3. uploads the folder containing these files to the Hub
This guide will show you how to use the [`~diffusers.utils.PushToHubMixin`] to upload your files to the Hub.
This guide will show you how to upload your files to the Hub with the [`~diffusers.utils.PushToHubMixin`] class.
You'll need to log in to your Hub account with your access [token](https://huggingface.co/settings/tokens) first:
Log in to your Hugging Face account with your access [token](https://huggingface.co/settings/tokens).
<hfoptions id="login">
<hfoption id="notebook">
```py
from huggingface_hub import notebook_login
@@ -30,9 +33,19 @@ from huggingface_hub import notebook_login
notebook_login()
```
</hfoption>
<hfoption id="hf CLI">
```bash
hf auth login
```
</hfoption>
</hfoptions>
## Models
To push a model to the Hub, call [`~diffusers.utils.PushToHubMixin.push_to_hub`] and specify the repository id of the model to be stored on the Hub:
To push a model to the Hub, call [`~diffusers.utils.PushToHubMixin.push_to_hub`] and specify the repository id of the model.
```py
from diffusers import ControlNetModel
@@ -48,15 +61,9 @@ controlnet = ControlNetModel(
controlnet.push_to_hub("my-controlnet-model")
```
For models, you can also specify the [*variant*](loading#checkpoint-variants) of the weights to push to the Hub. For example, to push `fp16` weights:
The [`~diffusers.utils.PushToHubMixin.push_to_hub`] method saves the model's `config.json` file and the weights are automatically saved as safetensors files.
```py
controlnet.push_to_hub("my-controlnet-model", variant="fp16")
```
The [`~diffusers.utils.PushToHubMixin.push_to_hub`] function saves the model's `config.json` file and the weights are automatically saved in the `safetensors` format.
Now you can reload the model from your repository on the Hub:
Load the model again with [`~ModelMixin.from_pretrained`].
```py
model = ControlNetModel.from_pretrained("your-namespace/my-controlnet-model")
@@ -64,7 +71,7 @@ model = ControlNetModel.from_pretrained("your-namespace/my-controlnet-model")
## Scheduler
To push a scheduler to the Hub, call [`~diffusers.utils.PushToHubMixin.push_to_hub`] and specify the repository id of the scheduler to be stored on the Hub:
To push a scheduler to the Hub, call [`~diffusers.utils.PushToHubMixin.push_to_hub`] and specify the repository id of the scheduler.
```py
from diffusers import DDIMScheduler
@@ -81,7 +88,7 @@ scheduler.push_to_hub("my-controlnet-scheduler")
The [`~diffusers.utils.PushToHubMixin.push_to_hub`] function saves the scheduler's `scheduler_config.json` file to the specified repository.
Now you can reload the scheduler from your repository on the Hub:
Load the scheduler again with [`~SchedulerMixin.from_pretrained`].
```py
scheduler = DDIMScheduler.from_pretrained("your-namepsace/my-controlnet-scheduler")
@@ -89,7 +96,7 @@ scheduler = DDIMScheduler.from_pretrained("your-namepsace/my-controlnet-schedule
## Pipeline
You can also push an entire pipeline with all its components to the Hub. For example, initialize the components of a [`StableDiffusionPipeline`] with the parameters you want:
To push a pipeline to the Hub, initialize the pipeline components with your desired parameters.
```py
from diffusers import (
@@ -143,7 +150,7 @@ text_encoder = CLIPTextModel(text_encoder_config)
tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip")
```
Pass all of the components to the [`StableDiffusionPipeline`] and call [`~diffusers.utils.PushToHubMixin.push_to_hub`] to push the pipeline to the Hub:
Pass all components to the pipeline and call [`~diffusers.utils.PushToHubMixin.push_to_hub`].
```py
components = {
@@ -160,7 +167,7 @@ pipeline = StableDiffusionPipeline(**components)
pipeline.push_to_hub("my-pipeline")
```
The [`~diffusers.utils.PushToHubMixin.push_to_hub`] function saves each component to a subfolder in the repository. Now you can reload the pipeline from your repository on the Hub:
The [`~diffusers.utils.PushToHubMixin.push_to_hub`] method saves each component to a subfolder in the repository. Load the pipeline again with [`~DiffusionPipeline.from_pretrained`].
```py
pipeline = StableDiffusionPipeline.from_pretrained("your-namespace/my-pipeline")
@@ -168,10 +175,10 @@ pipeline = StableDiffusionPipeline.from_pretrained("your-namespace/my-pipeline")
## Privacy
Set `private=True` in the [`~diffusers.utils.PushToHubMixin.push_to_hub`] function to keep your model, scheduler, or pipeline files private:
Set `private=True` in [`~diffusers.utils.PushToHubMixin.push_to_hub`] to keep your model, scheduler, or pipeline files private.
```py
controlnet.push_to_hub("my-controlnet-model-private", private=True)
```
Private repositories are only visible to you, and other users won't be able to clone the repository and your repository won't appear in search results. Even if a user has the URL to your private repository, they'll receive a `404 - Sorry, we can't find the page you are looking for`. You must be [logged in](https://huggingface.co/docs/huggingface_hub/quick-start#login) to load a model from a private repository.
Private repositories are only visible to you. Other users won't be able to clone the repository and it won't appear in search results. Even if a user has the URL to your private repository, they'll receive a `404 - Sorry, we can't find the page you are looking for`. You must be [logged in](https://huggingface.co/docs/huggingface_hub/quick-start#login) to load a model from a private repository.
+55 -98
View File
@@ -10,129 +10,86 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
specific language governing permissions and limitations under the License.
-->
# Reproducible pipelines
# Reproducibility
Diffusion models are inherently random which is what allows it to generate different outputs every time it is run. But there are certain times when you want to generate the same output every time, like when you're testing, replicating results, and even [improving image quality](#deterministic-batch-generation). While you can't expect to get identical results across platforms, you can expect reproducible results across releases and platforms within a certain tolerance range (though even this may vary).
Diffusion is a random process that generates a different output every time. For certain situations like testing and replicating results, you want to generate the same result each time, across releases and platforms within a certain tolerance range.
This guide will show you how to control randomness for deterministic generation on a CPU and GPU.
This guide will show you how to control sources of randomness and enable deterministic algorithms.
## Generator
Pipelines rely on [torch.randn](https://pytorch.org/docs/stable/generated/torch.randn.html), which uses a different random seed each time, to create the initial noisy tensors. To generate the same output on a CPU or GPU, use a [Generator](https://docs.pytorch.org/docs/stable/generated/torch.Generator.html) to manage how random values are generated.
> [!TIP]
> We strongly recommend reading PyTorch's [statement about reproducibility](https://pytorch.org/docs/stable/notes/randomness.html):
>
> "Completely reproducible results are not guaranteed across PyTorch releases, individual commits, or different platforms. Furthermore, results may not be reproducible between CPU and GPU executions, even when using identical seeds."
> If reproducibility is important to your use case, we recommend always using a CPU `Generator`. The performance loss is often negligible and you'll generate more similar values.
## Control randomness
<hfoptions id="generator">
<hfoption id="GPU">
During inference, pipelines rely heavily on random sampling operations which include creating the
Gaussian noise tensors to denoise and adding noise to the scheduling step.
The GPU uses a different random number generator than the CPU. Diffusers solves this issue with the [`~utils.torch_utils.randn_tensor`] function, which creates the random tensor on a CPU and then moves it to the GPU. This function is used everywhere inside the pipeline and you don't need to explicitly call it.
Take a look at the tensor values in the [`DDIMPipeline`] after two inference steps.
Use [manual_seed](https://docs.pytorch.org/docs/stable/generated/torch.manual_seed.html) as shown below to set a seed.
```python
from diffusers import DDIMPipeline
import numpy as np
ddim = DDIMPipeline.from_pretrained( "google/ddpm-cifar10-32", use_safetensors=True)
image = ddim(num_inference_steps=2, output_type="np").images
print(np.abs(image).sum())
```
Running the code above prints one value, but if you run it again you get a different value.
Each time the pipeline is run, [torch.randn](https://pytorch.org/docs/stable/generated/torch.randn.html) uses a different random seed to create the Gaussian noise tensors. This leads to a different result each time it is run and enables the diffusion pipeline to generate a different random image each time.
But if you need to reliably generate the same image, that depends on whether you're running the pipeline on a CPU or GPU.
> [!TIP]
> It might seem unintuitive to pass `Generator` objects to a pipeline instead of the integer value representing the seed. However, this is the recommended design when working with probabilistic models in PyTorch because a `Generator` is a *random state* that can be passed to multiple pipelines in a sequence. As soon as the `Generator` is consumed, the *state* is changed in place which means even if you passed the same `Generator` to a different pipeline, it won't produce the same result because the state is already changed.
<hfoptions id="hardware">
<hfoption id="CPU">
To generate reproducible results on a CPU, you'll need to use a PyTorch [Generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed. Now when you run the code, it always prints a value of `1491.1711` because the `Generator` object with the seed is passed to all the random functions in the pipeline. You should get a similar, if not the same, result on whatever hardware and PyTorch version you're using.
```python
```py
import torch
import numpy as np
from diffusers import DDIMPipeline
ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", device_map="cuda")
generator = torch.manual_seed(0)
image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
print(np.abs(image).sum())
```
</hfoption>
<hfoption id="CPU">
Set `device="cpu"` in the `Generator` and use [manual_seed](https://docs.pytorch.org/docs/stable/generated/torch.manual_seed.html) to set a seed for generating random numbers.
```py
import torch
import numpy as np
from diffusers import DDIMPipeline
ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32")
generator = torch.Generator(device="cpu").manual_seed(0)
image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
print(np.abs(image).sum())
```
</hfoption>
<hfoption id="GPU">
Writing a reproducible pipeline on a GPU is a bit trickier, and full reproducibility across different hardware is not guaranteed because matrix multiplication - which diffusion pipelines require a lot of - is less deterministic on a GPU than a CPU. For example, if you run the same code example from the CPU example, you'll get a different result even though the seed is identical. This is because the GPU uses a different random number generator than the CPU.
```python
import torch
import numpy as np
from diffusers import DDIMPipeline
ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
ddim.to("cuda")
generator = torch.Generator(device="cuda").manual_seed(0)
image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
print(np.abs(image).sum())
```
To avoid this issue, Diffusers has a [`~utils.torch_utils.randn_tensor`] function for creating random noise on the CPU, and then moving the tensor to a GPU if necessary. The [`~utils.torch_utils.randn_tensor`] function is used everywhere inside the pipeline. Now you can call [torch.manual_seed](https://pytorch.org/docs/stable/generated/torch.manual_seed.html) which automatically creates a CPU `Generator` that can be passed to the pipeline even if it is being run on a GPU.
```python
import torch
import numpy as np
from diffusers import DDIMPipeline
ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
ddim.to("cuda")
generator = torch.manual_seed(0)
image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
print(np.abs(image).sum())
```
> [!TIP]
> If reproducibility is important to your use case, we recommend always passing a CPU `Generator`. The performance loss is often negligible and you'll generate more similar values than if the pipeline had been run on a GPU.
Finally, more complex pipelines such as [`UnCLIPPipeline`], are often extremely
susceptible to precision error propagation. You'll need to use
exactly the same hardware and PyTorch version for full reproducibility.
</hfoption>
</hfoptions>
The `Generator` object should be passed to the pipeline instead of an integer seed. `Generator` maintains a *random state* that is consumed and modified when used. Once consumed, the same `Generator` object produces different results in subsequent calls, even across different pipelines, because its *state* has changed.
```py
generator = torch.manual_seed(0)
for _ in range(5):
- image = pipeline(prompt, generator=generator)
+ image = pipeline(prompt, generator=torch.manual_seed(0))
```
## Deterministic algorithms
You can also configure PyTorch to use deterministic algorithms to create a reproducible pipeline. The downside is that deterministic algorithms may be slower than non-deterministic ones and you may observe a decrease in performance.
PyTorch supports [deterministic algorithms](https://docs.pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms) - where available - for certain operations so they produce the same results. Deterministic algorithms may be slower and decrease performance.
Non-deterministic behavior occurs when operations are launched in more than one CUDA stream. To avoid this, set the environment variable [CUBLAS_WORKSPACE_CONFIG](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility) to `:16:8` to only use one buffer size during runtime.
PyTorch typically benchmarks multiple algorithms to select the fastest one, but if you want reproducibility, you should disable this feature because the benchmark may select different algorithms each time. Set Diffusers [enable_full_determinism](https://github.com/huggingface/diffusers/blob/142f353e1c638ff1d20bd798402b68f72c1ebbdd/src/diffusers/utils/testing_utils.py#L861) to enable deterministic algorithms.
```py
enable_full_determinism()
```
Now when you run the same pipeline twice, you'll get identical results.
Use Diffusers' [enable_full_determinism](https://github.com/huggingface/diffusers/blob/142f353e1c638ff1d20bd798402b68f72c1ebbdd/src/diffusers/utils/testing_utils.py#L861) function to enable deterministic algorithms.
```py
import torch
from diffusers import DDIMScheduler, StableDiffusionPipeline
from diffusers_utils import enable_full_determinism
pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", use_safetensors=True).to("cuda")
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
g = torch.Generator(device="cuda")
prompt = "A bear is playing a guitar on Times Square"
g.manual_seed(0)
result1 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
g.manual_seed(0)
result2 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
print("L_inf dist =", abs(result1 - result2).max())
"L_inf dist = tensor(0., device='cuda:0')"
enable_full_determinism()
```
Under the hood, `enable_full_determinism` works by:
- Setting the environment variable [CUBLAS_WORKSPACE_CONFIG](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility) to `:16:8` to only use one buffer size during runtime. Non-deterministic behavior occurs when operations are used in more than one CUDA stream.
- Disabling benchmarking to find the fastest convolution operation by setting `torch.backends.cudnn.benchmark=False`. Non-deterministic behavior occurs because the benchmark may select different algorithms each time depending on hardware or benchmarking noise.
- Disabling TensorFloat32 (TF32) operations in favor of more precise and consistent full-precision operations.
## Resources
We strongly recommend reading PyTorch's developer notes about [Reproducibility](https://docs.pytorch.org/docs/stable/notes/randomness.html). You can try to limit randomness, but reproducibility is not *guaranteed*, even with an identical seed.
@@ -165,53 +165,6 @@ image
Most images look very similar and are comparable in quality. Again, it often comes down to your specific use case so a good approach is to run multiple different schedulers and compare the results.
### Flax schedulers
To compare Flax schedulers, you need to additionally load the scheduler state into the model parameters. For example, let's change the default scheduler in [`FlaxStableDiffusionPipeline`] to use the super fast [`FlaxDPMSolverMultistepScheduler`].
> [!WARNING]
> The [`FlaxLMSDiscreteScheduler`] and [`FlaxDDPMScheduler`] are not compatible with the [`FlaxStableDiffusionPipeline`] yet.
```py
import jax
import numpy as np
from flax.jax_utils import replicate
from flax.training.common_utils import shard
from diffusers import FlaxStableDiffusionPipeline, FlaxDPMSolverMultistepScheduler
scheduler, scheduler_state = FlaxDPMSolverMultistepScheduler.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5",
subfolder="scheduler"
)
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
"stable-diffusion-v1-5/stable-diffusion-v1-5",
scheduler=scheduler,
variant="bf16",
dtype=jax.numpy.bfloat16,
)
params["scheduler"] = scheduler_state
```
Then you can take advantage of Flax's compatibility with TPUs to generate a number of images in parallel. You'll need to make a copy of the model parameters for each available device and then split the inputs across them to generate your desired number of images.
```py
# Generate 1 image per parallel device (8 on TPUv2-8 or TPUv3-8)
prompt = "A photograph of an astronaut riding a horse on Mars, high resolution, high definition."
num_samples = jax.device_count()
prompt_ids = pipeline.prepare_inputs([prompt] * num_samples)
prng_seed = jax.random.PRNGKey(0)
num_inference_steps = 25
# shard inputs and rng
params = replicate(params)
prng_seed = jax.random.split(prng_seed, jax.device_count())
prompt_ids = shard(prompt_ids)
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
```
## Models
Models are loaded from the [`ModelMixin.from_pretrained`] method, which downloads and caches the latest version of the model weights and configurations. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache instead of re-downloading them.
@@ -1,225 +0,0 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# JAX/Flax
[[open-in-colab]]
🤗 Diffusers supports Flax for super fast inference on Google TPUs, such as those available in Colab, Kaggle or Google Cloud Platform. This guide shows you how to run inference with Stable Diffusion using JAX/Flax.
Before you begin, make sure you have the necessary libraries installed:
```py
# uncomment to install the necessary libraries in Colab
#!pip install -q jax==0.3.25 jaxlib==0.3.25 flax transformers ftfy
#!pip install -q diffusers
```
You should also make sure you're using a TPU backend. While JAX does not run exclusively on TPUs, you'll get the best performance on a TPU because each server has 8 TPU accelerators working in parallel.
If you are running this guide in Colab, select *Runtime* in the menu above, select the option *Change runtime type*, and then select *TPU* under the *Hardware accelerator* setting. Import JAX and quickly check whether you're using a TPU:
```python
import jax
import jax.tools.colab_tpu
jax.tools.colab_tpu.setup_tpu()
num_devices = jax.device_count()
device_type = jax.devices()[0].device_kind
print(f"Found {num_devices} JAX devices of type {device_type}.")
assert (
"TPU" in device_type,
"Available device is not a TPU, please select TPU from Runtime > Change runtime type > Hardware accelerator"
)
# Found 8 JAX devices of type Cloud TPU.
```
Great, now you can import the rest of the dependencies you'll need:
```python
import jax.numpy as jnp
from jax import pmap
from flax.jax_utils import replicate
from flax.training.common_utils import shard
from diffusers import FlaxStableDiffusionPipeline
```
## Load a model
Flax is a functional framework, so models are stateless and parameters are stored outside of them. Loading a pretrained Flax pipeline returns *both* the pipeline and the model weights (or parameters). In this guide, you'll use `bfloat16`, a more efficient half-float type that is supported by TPUs (you can also use `float32` for full precision if you want).
```python
dtype = jnp.bfloat16
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
"CompVis/stable-diffusion-v1-4",
variant="bf16",
dtype=dtype,
)
```
## Inference
TPUs usually have 8 devices working in parallel, so let's use the same prompt for each device. This means you can perform inference on 8 devices at once, with each device generating one image. As a result, you'll get 8 images in the same amount of time it takes for one chip to generate a single image!
<Tip>
Learn more details in the [How does parallelization work?](#how-does-parallelization-work) section.
</Tip>
After replicating the prompt, get the tokenized text ids by calling the `prepare_inputs` function on the pipeline. The length of the tokenized text is set to 77 tokens as required by the configuration of the underlying CLIP text model.
```python
prompt = "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of field, close up, split lighting, cinematic"
prompt = [prompt] * jax.device_count()
prompt_ids = pipeline.prepare_inputs(prompt)
prompt_ids.shape
# (8, 77)
```
Model parameters and inputs have to be replicated across the 8 parallel devices. The parameters dictionary is replicated with [`flax.jax_utils.replicate`](https://flax.readthedocs.io/en/latest/api_reference/flax.jax_utils.html#flax.jax_utils.replicate) which traverses the dictionary and changes the shape of the weights so they are repeated 8 times. Arrays are replicated using `shard`.
```python
# parameters
p_params = replicate(params)
# arrays
prompt_ids = shard(prompt_ids)
prompt_ids.shape
# (8, 1, 77)
```
This shape means each one of the 8 devices receives as an input a `jnp` array with shape `(1, 77)`, where `1` is the batch size per device. On TPUs with sufficient memory, you could have a batch size larger than `1` if you want to generate multiple images (per chip) at once.
Next, create a random number generator to pass to the generation function. This is standard procedure in Flax, which is very serious and opinionated about random numbers. All functions that deal with random numbers are expected to receive a generator to ensure reproducibility, even when you're training across multiple distributed devices.
The helper function below uses a seed to initialize a random number generator. As long as you use the same seed, you'll get the exact same results. Feel free to use different seeds when exploring results later in the guide.
```python
def create_key(seed=0):
return jax.random.PRNGKey(seed)
```
The key returned by the helper function, `rng`, is split 8 times so each device receives a different generator and generates a different image.
```python
rng = create_key(0)
rng = jax.random.split(rng, jax.device_count())
```
To take advantage of JAX's optimized speed on a TPU, pass `jit=True` to the pipeline to compile the JAX code into an efficient representation and to ensure the model runs in parallel across the 8 devices.
<Tip warning={true}>
You need to ensure all your inputs have the same shape in subsequent calls, otherwise JAX will need to recompile the code which is slower.
</Tip>
The first inference run takes more time because it needs to compile the code, but subsequent calls (even with different inputs) are much faster. For example, it took more than a minute to compile on a TPU v2-8, but then it takes about **7s** on a future inference run!
```py
%%time
images = pipeline(prompt_ids, p_params, rng, jit=True)[0]
# CPU times: user 56.2 s, sys: 42.5 s, total: 1min 38s
# Wall time: 1min 29s
```
The returned array has shape `(8, 1, 512, 512, 3)` which should be reshaped to remove the second dimension and get 8 images of `512 × 512 × 3`. Then you can use the [`~utils.numpy_to_pil`] function to convert the arrays into images.
```python
from diffusers.utils import make_image_grid
images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
images = pipeline.numpy_to_pil(images)
make_image_grid(images, rows=2, cols=4)
```
![img](https://huggingface.co/datasets/YiYiXu/test-doc-assets/resolve/main/stable_diffusion_jax_how_to_cell_38_output_0.jpeg)
## Using different prompts
You don't necessarily have to use the same prompt on all devices. For example, to generate 8 different prompts:
```python
prompts = [
"Labrador in the style of Hokusai",
"Painting of a squirrel skating in New York",
"HAL-9000 in the style of Van Gogh",
"Times Square under water, with fish and a dolphin swimming around",
"Ancient Roman fresco showing a man working on his laptop",
"Close-up photograph of young black woman against urban background, high quality, bokeh",
"Armchair in the shape of an avocado",
"Clown astronaut in space, with Earth in the background",
]
prompt_ids = pipeline.prepare_inputs(prompts)
prompt_ids = shard(prompt_ids)
images = pipeline(prompt_ids, p_params, rng, jit=True).images
images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
images = pipeline.numpy_to_pil(images)
make_image_grid(images, 2, 4)
```
![img](https://huggingface.co/datasets/YiYiXu/test-doc-assets/resolve/main/stable_diffusion_jax_how_to_cell_43_output_0.jpeg)
## How does parallelization work?
The Flax pipeline in 🤗 Diffusers automatically compiles the model and runs it in parallel on all available devices. Let's take a closer look at how that process works.
JAX parallelization can be done in multiple ways. The easiest one revolves around using the [`jax.pmap`](https://jax.readthedocs.io/en/latest/_autosummary/jax.pmap.html) function to achieve single-program multiple-data (SPMD) parallelization. It means running several copies of the same code, each on different data inputs. More sophisticated approaches are possible, and you can go over to the JAX [documentation](https://jax.readthedocs.io/en/latest/index.html) to explore this topic in more detail if you are interested!
`jax.pmap` does two things:
1. Compiles (or "`jit`s") the code, similar to `jax.jit()`. Compilation does not happen when you call `pmap`; it happens only the first time the `pmap`ped function is called.
2. Ensures the compiled code runs in parallel on all available devices.
To demonstrate, call `pmap` on the pipeline's `_generate` method (this is a private method that generates images and may be renamed or removed in future releases of 🤗 Diffusers):
```python
p_generate = pmap(pipeline._generate)
```
After calling `pmap`, the prepared function `p_generate` will:
1. Make a copy of the underlying function, `pipeline._generate`, on each device.
2. Send each device a different portion of the input arguments (this is why it's necessary to call the *shard* function). In this case, `prompt_ids` has shape `(8, 1, 77)` so the array is split into 8 and each copy of `_generate` receives an input with shape `(1, 77)`.
The most important thing to pay attention to here is the batch size (1 in this example), and the input dimensions that make sense for your code. You don't have to change anything else to make the code work in parallel.
The first time you call the pipeline takes more time, but the calls afterward are much faster. The `block_until_ready` function is used to correctly measure inference time because JAX uses asynchronous dispatch and returns control to the Python loop as soon as it can. You don't need to use that in your code; blocking occurs automatically when you want to use the result of a computation that has not yet been materialized.
```py
%%time
images = p_generate(prompt_ids, p_params, rng)
images = images.block_until_ready()
# CPU times: user 1min 15s, sys: 18.2 s, total: 1min 34s
# Wall time: 1min 15s
```
Check your image dimensions to see if they're correct:
```python
images.shape
# (8, 1, 512, 512, 3)
```
## Resources
To learn more about how JAX works with Stable Diffusion, you may be interested in reading:
* [Accelerating Stable Diffusion XL Inference with JAX on Cloud TPU v5e](https://hf.co/blog/sdxl_jax)
@@ -98,7 +98,7 @@ pipeline_quant_config = PipelineQuantizationConfig(
"bnb_4bit_quant_type": "nf4",
"bnb_4bit_compute_dtype": torch.bfloat16
},
components_to_quantize=["transformer"]
components_to_quantize="transformer"
)
pipeline = HunyuanVideoPipeline.from_pretrained(
@@ -287,7 +287,7 @@ export_to_video(output, "output.mp4", fps=16)
## Reduce memory usage
Recent video models like [`HunyuanVideoPipeline`] and [`WanPipeline`], which have 10B+ parameters, require a lot of memory and it often exceeds the memory availabe on consumer hardware. Diffusers offers several techniques for reducing the memory requirements of these large models.
Recent video models like [`HunyuanVideoPipeline`] and [`WanPipeline`], which have 10B+ parameters, require a lot of memory and it often exceeds the memory available on consumer hardware. Diffusers offers several techniques for reducing the memory requirements of these large models.
> [!TIP]
> Refer to the [Reduce memory usage](../optimization/memory) guide for more details about other memory saving techniques.
@@ -48,10 +48,10 @@ t2i_pipeline = t2i_blocks.init_pipeline(modular_repo_id, components_manager=comp
</hfoption>
</hfoptions>
组件仅在调用 [`~ModularPipeline.load_components`] 或 [`~ModularPipeline.load_default_components`] 时加载和注册。以下示例使用 [`~ModularPipeline.load_default_components`] 创建第二个管道,重用第一个管道的所有组件,并将其分配到不同的集合。
组件仅在调用 [`~ModularPipeline.load_components`] 时加载和注册。以下示例使用 [`~ModularPipeline.load_components`] 创建第二个管道,重用第一个管道的所有组件,并将其分配到不同的集合。
```py
pipe.load_default_components()
pipe.load_components()
pipe2 = ModularPipeline.from_pretrained("YiYiXu/modular-demo-auto", components_manager=comp, collection="test2")
```
@@ -185,4 +185,4 @@ comp.enable_auto_cpu_offload(device="cuda")
所有模型开始时都在 CPU 上,[`ComponentsManager`] 在需要它们之前将它们移动到适当的设备,并在 GPU 内存不足时将其他模型移回 CPU。
您可以设置自己的规则来决定哪些模型要卸载。
您可以设置自己的规则来决定哪些模型要卸载。
+3 -3
View File
@@ -73,13 +73,13 @@ ComponentSpec(name='guider', type_hint=<class 'diffusers.guiders.perturbed_atten
}
```
引导器只有在调用 [`~ModularPipeline.load_default_components`] 之后才会创建,基于 `modular_model_index.json` 中的加载规范。
引导器只有在调用 [`~ModularPipeline.load_components`] 之后才会创建,基于 `modular_model_index.json` 中的加载规范。
```py
t2i_pipeline = t2i_blocks.init_pipeline("YiYiXu/modular-doc-guider")
# 在初始化时未创建
assert t2i_pipeline.guider is None
t2i_pipeline.load_default_components()
t2i_pipeline.load_components()
# 加载为 PAG 引导器
t2i_pipeline.guider
```
@@ -170,4 +170,4 @@ t2i_pipeline.push_to_hub("YiYiXu/modular-doc-guider")
```
</hfoption>
</hfoptions>
</hfoptions>
@@ -28,7 +28,7 @@ blocks = SequentialPipelineBlocks.from_blocks_dict(TEXT2IMAGE_BLOCKS)
modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
pipeline = blocks.init_pipeline(modular_repo_id)
pipeline.load_default_components(torch_dtype=torch.float16)
pipeline.load_components(torch_dtype=torch.float16)
pipeline.to("cuda")
image = pipeline(prompt="Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", output="images")[0]
@@ -48,7 +48,7 @@ blocks = SequentialPipelineBlocks.from_blocks_dict(IMAGE2IMAGE_BLOCKS)
modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
pipeline = blocks.init_pipeline(modular_repo_id)
pipeline.load_default_components(torch_dtype=torch.float16)
pipeline.load_components(torch_dtype=torch.float16)
pipeline.to("cuda")
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png"
@@ -72,7 +72,7 @@ blocks = SequentialPipelineBlocks.from_blocks_dict(INPAINT_BLOCKS)
modular_repo_id = "YiYiXu/modular-loader-t2i-0704"
pipeline = blocks.init_pipeline(modular_repo_id)
pipeline.load_default_components(torch_dtype=torch.float16)
pipeline.load_components(torch_dtype=torch.float16)
pipeline.to("cuda")
img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl-text2img.png"
@@ -176,15 +176,15 @@ diffdiff_pipeline = ModularPipeline.from_pretrained(modular_repo_id, trust_remot
## 加载组件
一个[`ModularPipeline`]不会自动实例化组件。它只加载配置和组件规范。您可以使用[`~ModularPipeline.load_default_components`]加载所有组件,或仅使用[`~ModularPipeline.load_components`]加载特定组件。
一个[`ModularPipeline`]不会自动实例化组件。它只加载配置和组件规范。您可以使用[`~ModularPipeline.load_components`]加载所有组件,或仅使用[`~ModularPipeline.load_components`]加载特定组件。
<hfoptions id="load">
<hfoption id="load_default_components">
<hfoption id="load_components">
```py
import torch
t2i_pipeline.load_default_components(torch_dtype=torch.float16)
t2i_pipeline.load_components(torch_dtype=torch.float16)
t2i_pipeline.to("cuda")
```
@@ -175,7 +175,7 @@ print(dd_blocks)
将 [`SequentialPipelineBlocks`] 转换为 [`ModularPipeline`],使用 [`ModularPipeline.init_pipeline`] 方法。这会初始化从 `modular_model_index.json` 文件加载的预期组件。通过调用 [`ModularPipeline.load_default_components`]。
初始化[`ComponentManager`]时传入pipeline是一个好主意,以帮助管理不同的组件。一旦调用[`~ModularPipeline.load_default_components`],组件就会被注册到[`ComponentManager`]中,并且可以在工作流之间共享。下面的例子使用`collection`参数为组件分配了一个`"diffdiff"`标签,以便更好地组织。
初始化[`ComponentManager`]时传入pipeline是一个好主意,以帮助管理不同的组件。一旦调用[`~ModularPipeline.load_components`],组件就会被注册到[`ComponentManager`]中,并且可以在工作流之间共享。下面的例子使用`collection`参数为组件分配了一个`"diffdiff"`标签,以便更好地组织。
```py
from diffusers.modular_pipelines import ComponentsManager
@@ -209,11 +209,11 @@ ip_adapter_block = StableDiffusionXLAutoIPAdapterStep()
dd_blocks.sub_blocks.insert("ip_adapter", ip_adapter_block, 0)
```
调用[`~ModularPipeline.init_pipeline`]来初始化一个[`ModularPipeline`],并使用[`~ModularPipeline.load_default_components`]加载模型组件。加载并设置IP-Adapter以运行pipeline。
调用[`~ModularPipeline.init_pipeline`]来初始化一个[`ModularPipeline`],并使用[`~ModularPipeline.load_components`]加载模型组件。加载并设置IP-Adapter以运行pipeline。
```py
dd_pipeline = dd_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
dd_pipeline.load_default_components(torch_dtype=torch.float16)
dd_pipeline.load_components(torch_dtype=torch.float16)
dd_pipeline.loader.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
dd_pipeline.loader.set_ip_adapter_scale(0.6)
dd_pipeline = dd_pipeline.to(device)
@@ -261,14 +261,14 @@ class SDXLDiffDiffControlNetDenoiseStep(StableDiffusionXLDenoiseLoopWrapper):
controlnet_denoise_block = SDXLDiffDiffControlNetDenoiseStep()
```
插入 `controlnet_input` 块并用新的 `controlnet_denoise_block` 替换 `denoise` 块。初始化一个 [`ModularPipeline`] 并将 [`~ModularPipeline.load_default_components`] 加载到其中。
插入 `controlnet_input` 块并用新的 `controlnet_denoise_block` 替换 `denoise` 块。初始化一个 [`ModularPipeline`] 并将 [`~ModularPipeline.load_components`] 加载到其中。
```py
dd_blocks.sub_blocks.insert("controlnet_input", control_input_block, 7)
dd_blocks.sub_blocks["denoise"] = controlnet_denoise_block
dd_pipeline = dd_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
dd_pipeline.load_default_components(torch_dtype=torch.float16)
dd_pipeline.load_components(torch_dtype=torch.float16)
dd_pipeline = dd_pipeline.to(device)
control_image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/diffdiff_tomato_canny.jpeg")
@@ -322,7 +322,7 @@ DIFFDIFF_AUTO_BLOCKS.insert("controlnet_input",StableDiffusionXLControlNetAutoIn
```py
dd_auto_blocks = SequentialPipelineBlocks.from_blocks_dict(DIFFDIFF_AUTO_BLOCKS)
dd_pipeline = dd_auto_blocks.init_pipeline("YiYiXu/modular-demo-auto", collection="diffdiff")
dd_pipeline.load_default_components(torch_dtype=torch.float16)
dd_pipeline.load_components(torch_dtype=torch.float16)
```
## 分享
@@ -342,5 +342,5 @@ from diffusers.modular_pipelines import ModularPipeline, ComponentsManager
components = ComponentsManager()
diffdiff_pipeline = ModularPipeline.from_pretrained("YiYiXu/modular-diffdiff-0704", trust_remote_code=True, components_manager=components, collection="diffdiff")
diffdiff_pipeline.load_default_components(torch_dtype=torch.float16)
diffdiff_pipeline.load_components(torch_dtype=torch.float16)
```
@@ -223,7 +223,7 @@ from diffusers.image_processor import VaeImageProcessor
import torch
vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=torch.bfloat16).to("cuda")
vae_scale_factor = 2 ** (len(vae.config.block_out_channels))
vae_scale_factor = 2 ** (len(vae.config.block_out_channels) - 1)
image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)
with torch.no_grad():
@@ -1399,6 +1399,7 @@ def main(args):
torch_dtype = torch.float16
elif args.prior_generation_precision == "bf16":
torch_dtype = torch.bfloat16
pipeline = FluxPipeline.from_pretrained(
args.pretrained_model_name_or_path,
torch_dtype=torch_dtype,
@@ -1419,7 +1420,8 @@ def main(args):
for example in tqdm(
sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
):
images = pipeline(example["prompt"]).images
with torch.autocast(device_type=accelerator.device.type, dtype=torch_dtype):
images = pipeline(prompt=example["prompt"]).images
for i, image in enumerate(images):
hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
+2
View File
@@ -88,6 +88,8 @@ PIXART-α Controlnet pipeline | Implementation of the controlnet model for pixar
| FaithDiff Stable Diffusion XL Pipeline | Implementation of [(CVPR 2025) FaithDiff: Unleashing Diffusion Priors for Faithful Image Super-resolution](https://huggingface.co/papers/2411.18824) - FaithDiff is a faithful image super-resolution method that leverages latent diffusion models by actively adapting the diffusion prior and jointly fine-tuning its components (encoder and diffusion model) with an alignment module to ensure high fidelity and structural consistency. | [FaithDiff Stable Diffusion XL Pipeline](#faithdiff-stable-diffusion-xl-pipeline) | [![Hugging Face Models](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-blue)](https://huggingface.co/jychen9811/FaithDiff) | [Junyang Chen, Jinshan Pan, Jiangxin Dong, IMAG Lab, (Adapted by Eliseu Silva)](https://github.com/JyChen9811/FaithDiff) |
| Stable Diffusion 3 InstructPix2Pix Pipeline | Implementation of Stable Diffusion 3 InstructPix2Pix Pipeline | [Stable Diffusion 3 InstructPix2Pix Pipeline](#stable-diffusion-3-instructpix2pix-pipeline) | [![Hugging Face Models](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-blue)](https://huggingface.co/BleachNick/SD3_UltraEdit_freeform) [![Hugging Face Models](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-blue)](https://huggingface.co/CaptainZZZ/sd3-instructpix2pix) | [Jiayu Zhang](https://github.com/xduzhangjiayu) and [Haozhe Zhao](https://github.com/HaozheZhao)|
| Flux Kontext multiple images | A modified version of the `FluxKontextPipeline` that supports calling Flux Kontext with multiple reference images.| [Flux Kontext multiple input Pipeline](#flux-kontext-multiple-images) | - | [Net-Mist](https://github.com/Net-Mist) |
To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
```py
@@ -1705,6 +1705,12 @@ class FaithDiffStableDiffusionXLPipeline(
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
processing larger images.
"""
depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
deprecate(
"enable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.enable_tiling()
self.unet.denoise_encoder.enable_tiling()
@@ -1713,6 +1719,12 @@ class FaithDiffStableDiffusionXLPipeline(
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
deprecate(
"disable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.disable_tiling()
self.unet.denoise_encoder.disable_tiling()
@@ -35,6 +35,7 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import (
USE_PEFT_BACKEND,
deprecate,
is_torch_xla_available,
logging,
replace_example_docstring,
@@ -643,6 +644,12 @@ class FluxKontextPipeline(
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
processing larger images.
"""
depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
deprecate(
"enable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.enable_tiling()
# Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.disable_vae_tiling
@@ -651,6 +658,12 @@ class FluxKontextPipeline(
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
deprecate(
"disable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.disable_tiling()
def preprocess_image(self, image: PipelineImageInput, _auto_resize: bool, multiple_of: int) -> torch.Tensor:
@@ -30,6 +30,7 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import (
USE_PEFT_BACKEND,
deprecate,
is_torch_xla_available,
logging,
replace_example_docstring,
@@ -526,6 +527,12 @@ class RFInversionFluxPipeline(
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
"""
depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
deprecate(
"enable_vae_slicing",
"0.40.0",
depr_message,
)
self.vae.enable_slicing()
def disable_vae_slicing(self):
@@ -533,6 +540,12 @@ class RFInversionFluxPipeline(
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
deprecate(
"disable_vae_slicing",
"0.40.0",
depr_message,
)
self.vae.disable_slicing()
def enable_vae_tiling(self):
@@ -541,6 +554,12 @@ class RFInversionFluxPipeline(
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
processing larger images.
"""
depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
deprecate(
"enable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.enable_tiling()
def disable_vae_tiling(self):
@@ -548,6 +567,12 @@ class RFInversionFluxPipeline(
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
deprecate(
"disable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.disable_tiling()
def prepare_latents_inversion(
@@ -35,6 +35,7 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import (
USE_PEFT_BACKEND,
deprecate,
is_torch_xla_available,
logging,
replace_example_docstring,
@@ -702,6 +703,12 @@ class FluxSemanticGuidancePipeline(
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
processing larger images.
"""
depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
deprecate(
"enable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.enable_tiling()
# Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.disable_vae_tiling
@@ -710,6 +717,12 @@ class FluxSemanticGuidancePipeline(
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
deprecate(
"disable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.disable_tiling()
# Copied from diffusers.pipelines.flux.pipeline_flux.FluxPipeline.prepare_latents
@@ -28,6 +28,7 @@ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import (
USE_PEFT_BACKEND,
deprecate,
is_torch_xla_available,
logging,
replace_example_docstring,
@@ -503,6 +504,12 @@ class FluxCFGPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixi
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
"""
depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
deprecate(
"enable_vae_slicing",
"0.40.0",
depr_message,
)
self.vae.enable_slicing()
def disable_vae_slicing(self):
@@ -510,6 +517,12 @@ class FluxCFGPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixi
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
deprecate(
"disable_vae_slicing",
"0.40.0",
depr_message,
)
self.vae.disable_slicing()
def enable_vae_tiling(self):
@@ -518,6 +531,12 @@ class FluxCFGPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixi
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
processing larger images.
"""
depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
deprecate(
"enable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.enable_tiling()
def disable_vae_tiling(self):
@@ -525,6 +544,12 @@ class FluxCFGPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixi
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
deprecate(
"disable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.disable_tiling()
def prepare_latents(
@@ -29,11 +29,7 @@ from diffusers.models.transformers import SD3Transformer2DModel
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.pipelines.stable_diffusion_3.pipeline_output import StableDiffusion3PipelineOutput
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import (
is_torch_xla_available,
logging,
replace_example_docstring,
)
from diffusers.utils import is_torch_xla_available, logging, replace_example_docstring
from diffusers.utils.torch_utils import randn_tensor
@@ -504,6 +504,12 @@ class StableDiffusionBoxDiffPipeline(
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
"""
depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
deprecate(
"enable_vae_slicing",
"0.40.0",
depr_message,
)
self.vae.enable_slicing()
def disable_vae_slicing(self):
@@ -511,6 +517,12 @@ class StableDiffusionBoxDiffPipeline(
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
deprecate(
"disable_vae_slicing",
"0.40.0",
depr_message,
)
self.vae.disable_slicing()
def enable_vae_tiling(self):
@@ -519,6 +531,12 @@ class StableDiffusionBoxDiffPipeline(
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
processing larger images.
"""
depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
deprecate(
"enable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.enable_tiling()
def disable_vae_tiling(self):
@@ -526,6 +544,12 @@ class StableDiffusionBoxDiffPipeline(
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
deprecate(
"disable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.disable_tiling()
def _encode_prompt(
@@ -471,6 +471,12 @@ class StableDiffusionPAGPipeline(
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
"""
depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
deprecate(
"enable_vae_slicing",
"0.40.0",
depr_message,
)
self.vae.enable_slicing()
def disable_vae_slicing(self):
@@ -478,6 +484,12 @@ class StableDiffusionPAGPipeline(
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
deprecate(
"disable_vae_slicing",
"0.40.0",
depr_message,
)
self.vae.disable_slicing()
def enable_vae_tiling(self):
@@ -486,6 +498,12 @@ class StableDiffusionPAGPipeline(
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
processing larger images.
"""
depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
deprecate(
"enable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.enable_tiling()
def disable_vae_tiling(self):
@@ -493,6 +511,12 @@ class StableDiffusionPAGPipeline(
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
deprecate(
"disable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.disable_tiling()
def _encode_prompt(
@@ -26,7 +26,7 @@ from diffusers.models import AutoencoderKLHunyuanVideo, HunyuanVideoTransformer3
from diffusers.pipelines.hunyuan_video.pipeline_output import HunyuanVideoPipelineOutput
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import is_torch_xla_available, logging, replace_example_docstring
from diffusers.utils import deprecate, is_torch_xla_available, logging, replace_example_docstring
from diffusers.utils.torch_utils import randn_tensor
from diffusers.video_processor import VideoProcessor
@@ -481,6 +481,12 @@ class HunyuanVideoSTGPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
"""
depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
deprecate(
"enable_vae_slicing",
"0.40.0",
depr_message,
)
self.vae.enable_slicing()
def disable_vae_slicing(self):
@@ -488,6 +494,12 @@ class HunyuanVideoSTGPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
deprecate(
"disable_vae_slicing",
"0.40.0",
depr_message,
)
self.vae.disable_slicing()
def enable_vae_tiling(self):
@@ -496,6 +508,12 @@ class HunyuanVideoSTGPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
processing larger images.
"""
depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
deprecate(
"enable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.enable_tiling()
def disable_vae_tiling(self):
@@ -503,6 +521,12 @@ class HunyuanVideoSTGPipeline(DiffusionPipeline, HunyuanVideoLoraLoaderMixin):
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
deprecate(
"disable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.disable_tiling()
@property
+25 -5
View File
@@ -26,11 +26,7 @@ from diffusers.models import AutoencoderKLMochi, MochiTransformer3DModel
from diffusers.pipelines.mochi.pipeline_output import MochiPipelineOutput
from diffusers.pipelines.pipeline_utils import DiffusionPipeline
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from diffusers.utils import (
is_torch_xla_available,
logging,
replace_example_docstring,
)
from diffusers.utils import deprecate, is_torch_xla_available, logging, replace_example_docstring
from diffusers.utils.torch_utils import randn_tensor
from diffusers.video_processor import VideoProcessor
@@ -458,6 +454,12 @@ class MochiSTGPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
"""
depr_message = f"Calling `enable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_slicing()`."
deprecate(
"enable_vae_slicing",
"0.40.0",
depr_message,
)
self.vae.enable_slicing()
def disable_vae_slicing(self):
@@ -465,6 +467,12 @@ class MochiSTGPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_slicing()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_slicing()`."
deprecate(
"disable_vae_slicing",
"0.40.0",
depr_message,
)
self.vae.disable_slicing()
def enable_vae_tiling(self):
@@ -473,6 +481,12 @@ class MochiSTGPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
processing larger images.
"""
depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
deprecate(
"enable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.enable_tiling()
def disable_vae_tiling(self):
@@ -480,6 +494,12 @@ class MochiSTGPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
deprecate(
"disable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.disable_tiling()
def prepare_latents(
+7 -2
View File
@@ -25,6 +25,11 @@ from os.path import abspath, dirname, join
git_repo_path = abspath(join(dirname(dirname(dirname(__file__))), "src"))
sys.path.insert(1, git_repo_path)
# Add parent directory to path so we can import from tests
repo_root = abspath(dirname(dirname(__file__)))
if repo_root not in sys.path:
sys.path.insert(0, repo_root)
# silence FutureWarning warnings in tests since often we can't act on them until
# they become normal warnings - i.e. the tests still need to test the current functionality
@@ -32,13 +37,13 @@ warnings.simplefilter(action="ignore", category=FutureWarning)
def pytest_addoption(parser):
from diffusers.utils.testing_utils import pytest_addoption_shared
from tests.testing_utils import pytest_addoption_shared
pytest_addoption_shared(parser)
def pytest_terminal_summary(terminalreporter):
from diffusers.utils.testing_utils import pytest_terminal_summary_main
from tests.testing_utils import pytest_terminal_summary_main
make_reports = terminalreporter.config.getoption("--make-reports")
if make_reports:
+3 -2
View File
@@ -24,6 +24,8 @@ import math
import os
import random
import shutil
# Add repo root to path to import from tests
from pathlib import Path
import accelerate
@@ -54,8 +56,7 @@ from diffusers.optimization import get_scheduler
from diffusers.training_utils import compute_density_for_timestep_sampling, compute_loss_weighting_for_sd3, free_memory
from diffusers.utils import check_min_version, is_wandb_available, make_image_grid
from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
from diffusers.utils.testing_utils import backend_empty_cache
from diffusers.utils.torch_utils import is_compiled_module
from diffusers.utils.torch_utils import backend_empty_cache, is_compiled_module
if is_wandb_available():
@@ -1131,6 +1131,7 @@ def main(args):
torch_dtype = torch.float16
elif args.prior_generation_precision == "bf16":
torch_dtype = torch.bfloat16
pipeline = FluxPipeline.from_pretrained(
args.pretrained_model_name_or_path,
torch_dtype=torch_dtype,
@@ -1151,7 +1152,8 @@ def main(args):
for example in tqdm(
sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
):
images = pipeline(example["prompt"]).images
with torch.autocast(device_type=accelerator.device.type, dtype=torch_dtype):
images = pipeline(prompt=example["prompt"]).images
for i, image in enumerate(images):
hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
@@ -1159,8 +1161,7 @@ def main(args):
image.save(image_filename)
del pipeline
if torch.cuda.is_available():
torch.cuda.empty_cache()
free_memory()
# Handle the repository creation
if accelerator.is_main_process:
@@ -1728,6 +1729,10 @@ def main(args):
device=accelerator.device,
prompt=args.instance_prompt,
)
else:
prompt_embeds, pooled_prompt_embeds, text_ids = compute_text_embeddings(
prompts, text_encoders, tokenizers
)
# Convert images to latent space
if args.cache_latents:
@@ -29,8 +29,9 @@ from pathlib import Path
import numpy as np
import torch
import transformers
from accelerate import Accelerator
from accelerate import Accelerator, DistributedType
from accelerate.logging import get_logger
from accelerate.state import AcceleratorState
from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
from huggingface_hub import create_repo, upload_folder
from huggingface_hub.utils import insecure_hashlib
@@ -1222,6 +1223,9 @@ def main(args):
kwargs_handlers=[kwargs],
)
if accelerator.distributed_type == DistributedType.DEEPSPEED:
AcceleratorState().deepspeed_plugin.deepspeed_config["train_micro_batch_size_per_gpu"] = args.train_batch_size
# Disable AMP for MPS.
if torch.backends.mps.is_available():
accelerator.native_amp = False
@@ -1270,6 +1274,7 @@ def main(args):
subfolder="transformer",
revision=args.revision,
variant=args.variant,
torch_dtype=torch_dtype,
)
pipeline = FluxKontextPipeline.from_pretrained(
args.pretrained_model_name_or_path,
@@ -1292,7 +1297,8 @@ def main(args):
for example in tqdm(
sample_dataloader, desc="Generating class images", disable=not accelerator.is_local_main_process
):
images = pipeline(example["prompt"]).images
with torch.autocast(device_type=accelerator.device.type, dtype=torch_dtype):
images = pipeline(prompt=example["prompt"]).images
for i, image in enumerate(images):
hash_image = insecure_hashlib.sha1(image.tobytes()).hexdigest()
@@ -1436,17 +1442,20 @@ def main(args):
text_encoder_one_lora_layers_to_save = None
modules_to_save = {}
for model in models:
if isinstance(model, type(unwrap_model(transformer))):
if isinstance(unwrap_model(model), type(unwrap_model(transformer))):
model = unwrap_model(model)
transformer_lora_layers_to_save = get_peft_model_state_dict(model)
modules_to_save["transformer"] = model
elif isinstance(model, type(unwrap_model(text_encoder_one))):
elif isinstance(unwrap_model(model), type(unwrap_model(text_encoder_one))):
model = unwrap_model(model)
text_encoder_one_lora_layers_to_save = get_peft_model_state_dict(model)
modules_to_save["text_encoder"] = model
else:
raise ValueError(f"unexpected save model: {model.__class__}")
# make sure to pop weight so that corresponding model is not saved again
weights.pop()
if weights:
weights.pop()
FluxKontextPipeline.save_lora_weights(
output_dir,
@@ -1459,15 +1468,25 @@ def main(args):
transformer_ = None
text_encoder_one_ = None
while len(models) > 0:
model = models.pop()
if not accelerator.distributed_type == DistributedType.DEEPSPEED:
while len(models) > 0:
model = models.pop()
if isinstance(model, type(unwrap_model(transformer))):
transformer_ = model
elif isinstance(model, type(unwrap_model(text_encoder_one))):
text_encoder_one_ = model
else:
raise ValueError(f"unexpected save model: {model.__class__}")
if isinstance(unwrap_model(model), type(unwrap_model(transformer))):
transformer_ = unwrap_model(model)
elif isinstance(unwrap_model(model), type(unwrap_model(text_encoder_one))):
text_encoder_one_ = unwrap_model(model)
else:
raise ValueError(f"unexpected save model: {model.__class__}")
else:
transformer_ = FluxTransformer2DModel.from_pretrained(
args.pretrained_model_name_or_path, subfolder="transformer"
)
transformer_.add_adapter(transformer_lora_config)
text_encoder_one_ = text_encoder_cls_one.from_pretrained(
args.pretrained_model_name_or_path, subfolder="text_encoder"
)
lora_state_dict = FluxKontextPipeline.lora_state_dict(input_dir)
@@ -1899,6 +1918,10 @@ def main(args):
device=accelerator.device,
prompt=args.instance_prompt,
)
else:
prompt_embeds, pooled_prompt_embeds, text_ids = compute_text_embeddings(
prompts, text_encoders, tokenizers
)
# Convert images to latent space
if args.cache_latents:
@@ -2063,7 +2086,7 @@ def main(args):
progress_bar.update(1)
global_step += 1
if accelerator.is_main_process:
if accelerator.is_main_process or accelerator.distributed_type == DistributedType.DEEPSPEED:
if global_step % args.checkpointing_steps == 0:
# _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
if args.checkpoints_total_limit is not None:
@@ -1760,7 +1760,7 @@
"clip_local = None\n",
"clip_pos = None\n",
"\n",
"# constands for data handling\n",
"# constants for data handling\n",
"save_traj = False\n",
"save_data = False\n",
"output_dir = \"/content/\""
@@ -2,7 +2,7 @@
Please note that this project is not actively maintained. However, you can open an issue and tag @gzguevara.
[DreamBooth](https://huggingface.co/papers/2208.12242) is a method to personalize text2image models like stable diffusion given just a few(3~5) images of a subject. This project consists of **two parts**. Training Stable Diffusion for inpainting requieres prompt-image-mask pairs. The Unet of inpainiting models have 5 additional input channels (4 for the encoded masked-image and 1 for the mask itself).
[DreamBooth](https://huggingface.co/papers/2208.12242) is a method to personalize text2image models like stable diffusion given just a few (3~5) images of a subject. This project consists of **two parts**. Training Stable Diffusion for inpainting requires prompt-image-mask pairs. The Unet of inpainting models has 5 additional input channels (4 for the encoded masked-image and 1 for the mask itself).
**The first part**, the `multi_inpaint_dataset.ipynb` notebook, demonstrates how to make a 🤗 dataset of prompt-image-mask pairs. You can, however, skip the first part and move straight to the second part with the example datasets in this project. ([cat toy dataset masked](https://huggingface.co/datasets/gzguevara/cat_toy_masked), [mr. potato head dataset masked](https://huggingface.co/datasets/gzguevara/mr_potato_head_masked))
@@ -263,6 +263,12 @@ class PromptDiffusionPipeline(
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
processing larger images.
"""
depr_message = f"Calling `enable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.enable_tiling()`."
deprecate(
"enable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.enable_tiling()
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.disable_vae_tiling
@@ -271,6 +277,12 @@ class PromptDiffusionPipeline(
Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
computing decoding in one step.
"""
depr_message = f"Calling `disable_vae_tiling()` on a `{self.__class__.__name__}` is deprecated and this method will be removed in a future version. Please use `pipe.vae.disable_tiling()`."
deprecate(
"disable_vae_tiling",
"0.40.0",
depr_message,
)
self.vae.disable_tiling()
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt
+7 -1
View File
@@ -24,12 +24,18 @@ import tempfile
import torch
from diffusers import VQModel
from diffusers.utils.testing_utils import require_timm
# Add parent directories to path to import from tests
sys.path.append("..")
repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
if repo_root not in sys.path:
sys.path.insert(0, repo_root)
from test_examples_utils import ExamplesTestsAccelerate, run_command # noqa: E402
from tests.testing_utils import require_timm # noqa
logging.basicConfig(level=logging.DEBUG)
+2
View File
@@ -131,6 +131,7 @@ _deps = [
"gguf>=0.10.0",
"torchao>=0.7.0",
"bitsandbytes>=0.43.3",
"nvidia_modelopt[hf]>=0.33.1",
"regex!=2019.12.17",
"requests",
"tensorboard",
@@ -244,6 +245,7 @@ extras["bitsandbytes"] = deps_list("bitsandbytes", "accelerate")
extras["gguf"] = deps_list("gguf", "accelerate")
extras["optimum_quanto"] = deps_list("optimum_quanto", "accelerate")
extras["torchao"] = deps_list("torchao", "accelerate")
extras["nvidia_modelopt"] = deps_list("nvidia_modelopt[hf]")
if os.name == "nt": # windows
extras["flax"] = [] # jax is not supported on windows
+33
View File
@@ -13,6 +13,7 @@ from .utils import (
is_k_diffusion_available,
is_librosa_available,
is_note_seq_available,
is_nvidia_modelopt_available,
is_onnx_available,
is_opencv_available,
is_optimum_quanto_available,
@@ -111,6 +112,18 @@ except OptionalDependencyNotAvailable:
else:
_import_structure["quantizers.quantization_config"].append("QuantoConfig")
# Register `NVIDIAModelOptConfig` in the lazy import structure, or fall back to the dummy objects.
# The guard names three backends (torch, accelerate, nvidia-modelopt); the config is treated as
# unavailable as soon as ANY one of them is missing. The previous form,
# `not torch and not accelerate and not modelopt`, only raised when all three were missing, so an
# environment with torch but without nvidia-modelopt never reached the dummy fallback.
try:
    if not (is_torch_available() and is_accelerate_available() and is_nvidia_modelopt_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from .utils import dummy_nvidia_modelopt_objects

    # Expose every public name of the dummy module through the lazy import structure.
    _import_structure["utils.dummy_nvidia_modelopt_objects"] = [
        name for name in dir(dummy_nvidia_modelopt_objects) if not name.startswith("_")
    ]
else:
    _import_structure["quantizers.quantization_config"].append("NVIDIAModelOptConfig")
try:
if not is_onnx_available():
raise OptionalDependencyNotAvailable()
@@ -372,6 +385,10 @@ else:
[
"FluxAutoBlocks",
"FluxModularPipeline",
"QwenImageAutoBlocks",
"QwenImageEditAutoBlocks",
"QwenImageEditModularPipeline",
"QwenImageModularPipeline",
"StableDiffusionXLAutoBlocks",
"StableDiffusionXLModularPipeline",
"WanAutoBlocks",
@@ -493,7 +510,9 @@ else:
"PixArtAlphaPipeline",
"PixArtSigmaPAGPipeline",
"PixArtSigmaPipeline",
"QwenImageControlNetInpaintPipeline",
"QwenImageControlNetPipeline",
"QwenImageEditInpaintPipeline",
"QwenImageEditPipeline",
"QwenImageImg2ImgPipeline",
"QwenImageInpaintPipeline",
@@ -794,6 +813,14 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
else:
from .quantizers.quantization_config import QuantoConfig
try:
if not is_nvidia_modelopt_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
from .utils.dummy_nvidia_modelopt_objects import *
else:
from .quantizers.quantization_config import NVIDIAModelOptConfig
try:
if not is_onnx_available():
raise OptionalDependencyNotAvailable()
@@ -1016,6 +1043,10 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
from .modular_pipelines import (
FluxAutoBlocks,
FluxModularPipeline,
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditModularPipeline,
QwenImageModularPipeline,
StableDiffusionXLAutoBlocks,
StableDiffusionXLModularPipeline,
WanAutoBlocks,
@@ -1133,7 +1164,9 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
PixArtAlphaPipeline,
PixArtSigmaPAGPipeline,
PixArtSigmaPipeline,
QwenImageControlNetInpaintPipeline,
QwenImageControlNetPipeline,
QwenImageEditInpaintPipeline,
QwenImageEditPipeline,
QwenImageImg2ImgPipeline,
QwenImageInpaintPipeline,
@@ -38,6 +38,7 @@ deps = {
"gguf": "gguf>=0.10.0",
"torchao": "torchao>=0.7.0",
"bitsandbytes": "bitsandbytes>=0.43.3",
"nvidia_modelopt[hf]": "nvidia_modelopt[hf]>=0.33.1",
"regex": "regex!=2019.12.17",
"requests": "requests",
"tensorboard": "tensorboard",
+6 -6
View File
@@ -82,15 +82,15 @@ class AutoGuidance(BaseGuidance):
self.guidance_rescale = guidance_rescale
self.use_original_formulation = use_original_formulation
if auto_guidance_layers is None and auto_guidance_config is None:
is_layer_or_config_provided = auto_guidance_layers is not None or auto_guidance_config is not None
is_layer_and_config_provided = auto_guidance_layers is not None and auto_guidance_config is not None
if not is_layer_or_config_provided:
raise ValueError(
"Either `auto_guidance_layers` or `auto_guidance_config` must be provided to enable Skip Layer Guidance."
"Either `auto_guidance_layers` or `auto_guidance_config` must be provided to enable AutoGuidance."
)
if auto_guidance_layers is not None and auto_guidance_config is not None:
if is_layer_and_config_provided:
raise ValueError("Only one of `auto_guidance_layers` or `auto_guidance_config` can be provided.")
if (dropout is None and auto_guidance_layers is not None) or (
dropout is not None and auto_guidance_layers is None
):
if auto_guidance_config is None and dropout is None:
raise ValueError("`dropout` must be provided if `auto_guidance_layers` is provided.")
if auto_guidance_layers is not None:
@@ -61,7 +61,7 @@ def project(v0: torch.Tensor, v1: torch.Tensor, upcast_to_double: bool = True) -
def build_image_from_pyramid(pyramid: List[torch.Tensor]) -> torch.Tensor:
"""
Recovers the data space latents from the Laplacian pyramid frequency space. Implementation from the paper
(Algorihtm 2).
(Algorithm 2).
"""
# pyramid shapes: [[B, C, H, W], [B, C, H/2, W/2], ...]
img = pyramid[-1]
+10
View File
@@ -108,6 +108,7 @@ def _register_attention_processors_metadata():
from ..models.attention_processor import AttnProcessor2_0
from ..models.transformers.transformer_cogview4 import CogView4AttnProcessor
from ..models.transformers.transformer_flux import FluxAttnProcessor
from ..models.transformers.transformer_qwenimage import QwenDoubleStreamAttnProcessor2_0
from ..models.transformers.transformer_wan import WanAttnProcessor2_0
# AttnProcessor2_0
@@ -140,6 +141,14 @@ def _register_attention_processors_metadata():
metadata=AttentionProcessorMetadata(skip_processor_output_fn=_skip_proc_output_fn_Attention_FluxAttnProcessor),
)
# QwenDoubleStreamAttnProcessor2_0
AttentionProcessorRegistry.register(
model_class=QwenDoubleStreamAttnProcessor2_0,
metadata=AttentionProcessorMetadata(
skip_processor_output_fn=_skip_proc_output_fn_Attention_QwenDoubleStreamAttnProcessor2_0
),
)
def _register_transformer_blocks_metadata():
from ..models.attention import BasicTransformerBlock
@@ -298,4 +307,5 @@ _skip_proc_output_fn_Attention_CogView4AttnProcessor = _skip_attention___ret___h
_skip_proc_output_fn_Attention_WanAttnProcessor2_0 = _skip_attention___ret___hidden_states
# not sure what this is yet.
_skip_proc_output_fn_Attention_FluxAttnProcessor = _skip_attention___ret___hidden_states
_skip_proc_output_fn_Attention_QwenDoubleStreamAttnProcessor2_0 = _skip_attention___ret___hidden_states
# fmt: on
+3 -3
View File
@@ -54,11 +54,11 @@ class FasterCacheConfig:
Attributes:
spatial_attention_block_skip_range (`int`, defaults to `2`):
Calculate the attention states every `N` iterations. If this is set to `N`, the attention computation will
be skipped `N - 1` times (i.e., cached attention states will be re-used) before computing the new attention
be skipped `N - 1` times (i.e., cached attention states will be reused) before computing the new attention
states again.
temporal_attention_block_skip_range (`int`, *optional*, defaults to `None`):
Calculate the attention states every `N` iterations. If this is set to `N`, the attention computation will
be skipped `N - 1` times (i.e., cached attention states will be re-used) before computing the new attention
be skipped `N - 1` times (i.e., cached attention states will be reused) before computing the new attention
states again.
spatial_attention_timestep_skip_range (`Tuple[float, float]`, defaults to `(-1, 681)`):
The timestep range within which the spatial attention computation can be skipped without a significant loss
@@ -90,7 +90,7 @@ class FasterCacheConfig:
from the conditional branch outputs.
unconditional_batch_skip_range (`int`, defaults to `5`):
Process the unconditional branch every `N` iterations. If this is set to `N`, the unconditional branch
computation will be skipped `N - 1` times (i.e., cached unconditional branch states will be re-used) before
computation will be skipped `N - 1` times (i.e., cached unconditional branch states will be reused) before
computing the new unconditional branch states again.
unconditional_batch_timestep_skip_range (`Tuple[float, float]`, defaults to `(-1, 641)`):
The timestep range within which the unconditional branch computation can be skipped without a significant
@@ -45,15 +45,15 @@ class PyramidAttentionBroadcastConfig:
spatial_attention_block_skip_range (`int`, *optional*, defaults to `None`):
The number of times a specific spatial attention broadcast is skipped before computing the attention states
to reuse. If this is set to the value `N`, the attention computation will be skipped `N - 1` times (i.e.,
old attention states will be re-used) before computing the new attention states again.
old attention states will be reused) before computing the new attention states again.
temporal_attention_block_skip_range (`int`, *optional*, defaults to `None`):
The number of times a specific temporal attention broadcast is skipped before computing the attention
states to reuse. If this is set to the value `N`, the attention computation will be skipped `N - 1` times
(i.e., old attention states will be re-used) before computing the new attention states again.
(i.e., old attention states will be reused) before computing the new attention states again.
cross_attention_block_skip_range (`int`, *optional*, defaults to `None`):
The number of times a specific cross-attention broadcast is skipped before computing the attention states
to reuse. If this is set to the value `N`, the attention computation will be skipped `N - 1` times (i.e.,
old attention states will be re-used) before computing the new attention states again.
old attention states will be reused) before computing the new attention states again.
spatial_attention_timestep_skip_range (`Tuple[int, int]`, defaults to `(100, 800)`):
The range of timesteps to skip in the spatial attention layer. The attention computations will be
conditionally skipped if the current timestep is within the specified range.
@@ -305,7 +305,7 @@ def _apply_pyramid_attention_broadcast_hook(
block_skip_range (`int`):
The number of times a specific attention broadcast is skipped before computing the attention states to
re-use. If this is set to the value `N`, the attention computation will be skipped `N - 1` times (i.e., old
attention states will be re-used) before computing the new attention states again.
attention states will be reused) before computing the new attention states again.
current_timestep_callback (`Callable[[], int]`):
A callback function that returns the current inference timestep.
"""
+132
View File
@@ -523,6 +523,7 @@ class VaeImageProcessor(ConfigMixin):
size=(height, width),
)
image = self.pt_to_numpy(image)
return image
def binarize(self, image: PIL.Image.Image) -> PIL.Image.Image:
@@ -838,6 +839,137 @@ class VaeImageProcessor(ConfigMixin):
return image
class InpaintProcessor(ConfigMixin):
    """
    Image processor for inpainting image and mask.

    Holds two [`VaeImageProcessor`] instances built from the same resize settings (`do_resize`, `vae_scale_factor`,
    `vae_latent_channels`, `resample`, `reducing_gap`): one configured with the image flags (`do_normalize`,
    `do_binarize`, `do_convert_grayscale`) and one configured with the `mask_do_*` flags.
    """

    config_name = CONFIG_NAME

    @register_to_config
    def __init__(
        self,
        do_resize: bool = True,
        vae_scale_factor: int = 8,
        vae_latent_channels: int = 4,
        resample: str = "lanczos",
        reducing_gap: Optional[int] = None,
        do_normalize: bool = True,
        do_binarize: bool = False,
        do_convert_grayscale: bool = False,
        mask_do_normalize: bool = False,
        mask_do_binarize: bool = True,
        mask_do_convert_grayscale: bool = True,
    ):
        super().__init__()

        # Processor applied to the image to inpaint.
        self._image_processor = VaeImageProcessor(
            do_resize=do_resize,
            vae_scale_factor=vae_scale_factor,
            vae_latent_channels=vae_latent_channels,
            resample=resample,
            reducing_gap=reducing_gap,
            do_normalize=do_normalize,
            do_binarize=do_binarize,
            do_convert_grayscale=do_convert_grayscale,
        )
        # Processor applied to the mask; only the normalize/binarize/grayscale flags differ.
        self._mask_processor = VaeImageProcessor(
            do_resize=do_resize,
            vae_scale_factor=vae_scale_factor,
            vae_latent_channels=vae_latent_channels,
            resample=resample,
            reducing_gap=reducing_gap,
            do_normalize=mask_do_normalize,
            do_binarize=mask_do_binarize,
            do_convert_grayscale=mask_do_convert_grayscale,
        )

    def preprocess(
        self,
        image: PIL.Image.Image,
        mask: Optional[PIL.Image.Image] = None,
        height: Optional[int] = None,
        width: Optional[int] = None,
        padding_mask_crop: Optional[int] = None,
    ):
        """
        Preprocess the image and mask.

        Args:
            image: The image to preprocess.
            mask: The inpainting mask. When `None`, only the image is preprocessed.
            height: Target height forwarded to the underlying processors.
            width: Target width forwarded to the underlying processors.
            padding_mask_crop: Padding passed to `get_crop_region` to compute a crop region from `mask`. Requires
                `mask`.

        Returns:
            If `mask` is `None`: the result of the image processor's `preprocess` on its own (not a tuple).
            Otherwise: a 3-tuple `(processed_image, processed_mask, postprocessing_kwargs)`.
            `postprocessing_kwargs` is a dict with the keys `crops_coords`, `original_image` and `original_mask`,
            which match the keyword arguments of `postprocess`; its values are all `None` unless
            `padding_mask_crop` was given.

        Raises:
            ValueError: If `padding_mask_crop` is given without a `mask`.
        """
        if mask is None:
            if padding_mask_crop is not None:
                raise ValueError("mask must be provided if padding_mask_crop is provided")
            # if mask is None, same behavior as regular image processor
            return self._image_processor.preprocess(image, height=height, width=width)

        if padding_mask_crop is not None:
            crops_coords = self._image_processor.get_crop_region(mask, width, height, pad=padding_mask_crop)
            resize_mode = "fill"
        else:
            crops_coords = None
            resize_mode = "default"

        processed_image = self._image_processor.preprocess(
            image,
            height=height,
            width=width,
            crops_coords=crops_coords,
            resize_mode=resize_mode,
        )
        processed_mask = self._mask_processor.preprocess(
            mask,
            height=height,
            width=width,
            resize_mode=resize_mode,
            crops_coords=crops_coords,
        )

        # The originals are only handed back when a crop region was computed, since that is the only case in which
        # `postprocess` applies the overlay.
        use_overlay = crops_coords is not None
        postprocessing_kwargs = {
            "crops_coords": crops_coords,
            "original_image": image if use_overlay else None,
            "original_mask": mask if use_overlay else None,
        }

        return processed_image, processed_mask, postprocessing_kwargs

    def postprocess(
        self,
        image: torch.Tensor,
        output_type: str = "pil",
        original_image: Optional[PIL.Image.Image] = None,
        original_mask: Optional[PIL.Image.Image] = None,
        crops_coords: Optional[Tuple[int, int, int, int]] = None,
    ):
        """
        Postprocess the image, optionally apply mask overlay

        Args:
            image: The tensor to postprocess.
            output_type: Output format forwarded to the image processor. Must be `"pil"` when `crops_coords` is
                given.
            original_image: The original image, required when `crops_coords` is given.
            original_mask: The original mask, required when `crops_coords` is given.
            crops_coords: Crop region; when not `None`, `apply_overlay` is run on every postprocessed image.

        Returns:
            The output of the image processor's `postprocess`, or, when `crops_coords` is given, a list with the
            result of `apply_overlay` for each postprocessed image.

        Raises:
            ValueError: If `crops_coords` is given without `original_image`/`original_mask`, or with an
                `output_type` other than `"pil"`.
        """
        # Validate the overlay arguments up front so invalid calls fail before any image conversion work is done.
        if crops_coords is not None:
            if original_image is None or original_mask is None:
                raise ValueError("original_image and original_mask must be provided if crops_coords is provided")
            if output_type != "pil":
                raise ValueError("output_type must be 'pil' if crops_coords is provided")

        image = self._image_processor.postprocess(
            image,
            output_type=output_type,
        )

        if crops_coords is None:
            return image

        # optionally apply the mask overlay
        return [self._image_processor.apply_overlay(original_mask, original_image, i, crops_coords) for i in image]
class VaeImageProcessorLDM3D(VaeImageProcessor):
"""
Image processor for VAE LDM3D.
+39 -20
View File
@@ -2129,6 +2129,10 @@ def _convert_non_diffusers_ltxv_lora_to_diffusers(state_dict, non_diffusers_pref
def _convert_non_diffusers_qwen_lora_to_diffusers(state_dict):
has_diffusion_model = any(k.startswith("diffusion_model.") for k in state_dict)
if has_diffusion_model:
state_dict = {k.removeprefix("diffusion_model."): v for k, v in state_dict.items()}
has_lora_unet = any(k.startswith("lora_unet_") for k in state_dict)
if has_lora_unet:
state_dict = {k.removeprefix("lora_unet_"): v for k, v in state_dict.items()}
@@ -2201,29 +2205,44 @@ def _convert_non_diffusers_qwen_lora_to_diffusers(state_dict):
all_keys = list(state_dict.keys())
down_key = ".lora_down.weight"
up_key = ".lora_up.weight"
a_key = ".lora_A.weight"
b_key = ".lora_B.weight"
def get_alpha_scales(down_weight, alpha_key):
rank = down_weight.shape[0]
alpha = state_dict.pop(alpha_key).item()
scale = alpha / rank # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
scale_down = scale
scale_up = 1.0
while scale_down * 2 < scale_up:
scale_down *= 2
scale_up /= 2
return scale_down, scale_up
has_non_diffusers_lora_id = any(down_key in k or up_key in k for k in all_keys)
has_diffusers_lora_id = any(a_key in k or b_key in k for k in all_keys)
for k in all_keys:
if k.endswith(down_key):
diffusers_down_key = k.replace(down_key, ".lora_A.weight")
diffusers_up_key = k.replace(down_key, up_key).replace(up_key, ".lora_B.weight")
alpha_key = k.replace(down_key, ".alpha")
if has_non_diffusers_lora_id:
down_weight = state_dict.pop(k)
up_weight = state_dict.pop(k.replace(down_key, up_key))
scale_down, scale_up = get_alpha_scales(down_weight, alpha_key)
converted_state_dict[diffusers_down_key] = down_weight * scale_down
converted_state_dict[diffusers_up_key] = up_weight * scale_up
def get_alpha_scales(down_weight, alpha_key):
rank = down_weight.shape[0]
alpha = state_dict.pop(alpha_key).item()
scale = alpha / rank # LoRA is scaled by 'alpha / rank' in forward pass, so we need to scale it back here
scale_down = scale
scale_up = 1.0
while scale_down * 2 < scale_up:
scale_down *= 2
scale_up /= 2
return scale_down, scale_up
for k in all_keys:
if k.endswith(down_key):
diffusers_down_key = k.replace(down_key, ".lora_A.weight")
diffusers_up_key = k.replace(down_key, up_key).replace(up_key, ".lora_B.weight")
alpha_key = k.replace(down_key, ".alpha")
down_weight = state_dict.pop(k)
up_weight = state_dict.pop(k.replace(down_key, up_key))
scale_down, scale_up = get_alpha_scales(down_weight, alpha_key)
converted_state_dict[diffusers_down_key] = down_weight * scale_down
converted_state_dict[diffusers_up_key] = up_weight * scale_up
# Already in diffusers format (lora_A/lora_B), just pop
elif has_diffusers_lora_id:
for k in all_keys:
if a_key in k or b_key in k:
converted_state_dict[k] = state_dict.pop(k)
elif ".alpha" in k:
state_dict.pop(k)
if len(state_dict) > 0:
raise ValueError(f"`state_dict` should be empty at this point but has {state_dict.keys()=}")
+2 -1
View File
@@ -6684,7 +6684,8 @@ class QwenImageLoraLoaderMixin(LoraBaseMixin):
has_alphas_in_sd = any(k.endswith(".alpha") for k in state_dict)
has_lora_unet = any(k.startswith("lora_unet_") for k in state_dict)
if has_alphas_in_sd or has_lora_unet:
has_diffusion_model = any(k.startswith("diffusion_model.") for k in state_dict)
if has_alphas_in_sd or has_lora_unet or has_diffusion_model:
state_dict = _convert_non_diffusers_qwen_lora_to_diffusers(state_dict)
out = (state_dict, metadata) if return_lora_metadata else state_dict
+23 -12
View File
@@ -22,6 +22,7 @@ from huggingface_hub.utils import validate_hf_hub_args
from typing_extensions import Self
from .. import __version__
from ..models.model_loading_utils import _caching_allocator_warmup, _determine_device_map, _expand_device_map
from ..quantizers import DiffusersAutoQuantizer
from ..utils import deprecate, is_accelerate_available, is_torch_version, logging
from ..utils.torch_utils import empty_device_cache
@@ -297,6 +298,7 @@ class FromOriginalModelMixin:
low_cpu_mem_usage = kwargs.pop("low_cpu_mem_usage", _LOW_CPU_MEM_USAGE_DEFAULT)
device = kwargs.pop("device", None)
disable_mmap = kwargs.pop("disable_mmap", False)
device_map = kwargs.pop("device_map", None)
user_agent = {"diffusers": __version__, "file_type": "single_file", "framework": "pytorch"}
# In order to ensure popular quantization methods are supported. Can be disable with `disable_telemetry`
@@ -403,19 +405,8 @@ class FromOriginalModelMixin:
with ctx():
model = cls.from_config(diffusers_model_config)
checkpoint_mapping_kwargs = _get_mapping_function_kwargs(checkpoint_mapping_fn, **kwargs)
model_state_dict = model.state_dict()
if _should_convert_state_dict_to_diffusers(model.state_dict(), checkpoint):
diffusers_format_checkpoint = checkpoint_mapping_fn(
config=diffusers_model_config, checkpoint=checkpoint, **checkpoint_mapping_kwargs
)
else:
diffusers_format_checkpoint = checkpoint
if not diffusers_format_checkpoint:
raise SingleFileComponentError(
f"Failed to load {mapping_class_name}. Weights for this component appear to be missing in the checkpoint."
)
# Check if `_keep_in_fp32_modules` is not None
use_keep_in_fp32_modules = (cls._keep_in_fp32_modules is not None) and (
(torch_dtype == torch.float16) or hasattr(hf_quantizer, "use_keep_in_fp32_modules")
@@ -428,6 +419,26 @@ class FromOriginalModelMixin:
else:
keep_in_fp32_modules = []
# Now that the model is loaded, we can determine the `device_map`
device_map = _determine_device_map(model, device_map, None, torch_dtype, keep_in_fp32_modules, hf_quantizer)
if device_map is not None:
expanded_device_map = _expand_device_map(device_map, model_state_dict.keys())
_caching_allocator_warmup(model, expanded_device_map, torch_dtype, hf_quantizer)
checkpoint_mapping_kwargs = _get_mapping_function_kwargs(checkpoint_mapping_fn, **kwargs)
if _should_convert_state_dict_to_diffusers(model_state_dict, checkpoint):
diffusers_format_checkpoint = checkpoint_mapping_fn(
config=diffusers_model_config, checkpoint=checkpoint, **checkpoint_mapping_kwargs
)
else:
diffusers_format_checkpoint = checkpoint
if not diffusers_format_checkpoint:
raise SingleFileComponentError(
f"Failed to load {mapping_class_name}. Weights for this component appear to be missing in the checkpoint."
)
if hf_quantizer is not None:
hf_quantizer.preprocess_model(
model=model,
+70 -4
View File
@@ -26,6 +26,7 @@ from ..utils import (
is_flash_attn_3_available,
is_flash_attn_available,
is_flash_attn_version,
is_kernels_available,
is_sageattention_available,
is_sageattention_version,
is_torch_npu_available,
@@ -35,7 +36,7 @@ from ..utils import (
is_xformers_available,
is_xformers_version,
)
from ..utils.constants import DIFFUSERS_ATTN_BACKEND, DIFFUSERS_ATTN_CHECKS
from ..utils.constants import DIFFUSERS_ATTN_BACKEND, DIFFUSERS_ATTN_CHECKS, DIFFUSERS_ENABLE_HUB_KERNELS
_REQUIRED_FLASH_VERSION = "2.6.3"
@@ -67,6 +68,17 @@ else:
flash_attn_3_func = None
flash_attn_3_varlen_func = None
if DIFFUSERS_ENABLE_HUB_KERNELS:
if not is_kernels_available():
raise ImportError(
"To use FA3 kernel for your hardware from the Hub, the `kernels` library must be installed. Install with `pip install kernels`."
)
from ..utils.kernels_utils import _get_fa3_from_hub
flash_attn_interface_hub = _get_fa3_from_hub()
flash_attn_3_func_hub = flash_attn_interface_hub.flash_attn_func
else:
flash_attn_3_func_hub = None
if _CAN_USE_SAGE_ATTN:
from sageattention import (
@@ -153,6 +165,8 @@ class AttentionBackendName(str, Enum):
FLASH_VARLEN = "flash_varlen"
_FLASH_3 = "_flash_3"
_FLASH_VARLEN_3 = "_flash_varlen_3"
_FLASH_3_HUB = "_flash_3_hub"
# _FLASH_VARLEN_3_HUB = "_flash_varlen_3_hub" # not supported yet.
# PyTorch native
FLEX = "flex"
@@ -351,6 +365,17 @@ def _check_attention_backend_requirements(backend: AttentionBackendName) -> None
f"Flash Attention 3 backend '{backend.value}' is not usable because of missing package or the version is too old. Please build FA3 beta release from source."
)
# TODO: add support Hub variant of FA3 varlen later
elif backend in [AttentionBackendName._FLASH_3_HUB]:
if not DIFFUSERS_ENABLE_HUB_KERNELS:
raise RuntimeError(
f"Flash Attention 3 Hub backend '{backend.value}' is not usable because the `DIFFUSERS_ENABLE_HUB_KERNELS` env var isn't set. Please set it like `export DIFFUSERS_ENABLE_HUB_KERNELS=yes`."
)
if not is_kernels_available():
raise RuntimeError(
f"Flash Attention 3 Hub backend '{backend.value}' is not usable because the `kernels` package isn't available. Please install it with `pip install kernels`."
)
elif backend in [
AttentionBackendName.SAGE,
AttentionBackendName.SAGE_VARLEN,
@@ -657,6 +682,44 @@ def _flash_attention_3(
return (out, lse) if return_attn_probs else out
@_AttentionBackendRegistry.register(
    AttentionBackendName._FLASH_3_HUB,
    constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
)
def _flash_attention_3_hub(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    scale: Optional[float] = None,
    is_causal: bool = False,
    window_size: Tuple[int, int] = (-1, -1),
    softcap: float = 0.0,
    deterministic: bool = False,
    return_attn_probs: bool = False,
) -> torch.Tensor:
    """
    Attention backend that forwards to `flash_attn_3_func_hub`.

    Args:
        query, key, value: Passed to the kernel as `q`, `k` and `v`.
        scale: Passed as `softmax_scale`.
        is_causal: Passed as `causal`.
        window_size, softcap, deterministic: Passed through under the same names.
        return_attn_probs: Passed through; also selects the return form.

    Returns:
        The kernel output, or the pair `(output, lse)` when `return_attn_probs` is True.
    """
    # Arguments this backend does not expose are pinned to fixed values.
    fixed_kernel_kwargs = {
        "qv": None,
        "q_descale": None,
        "k_descale": None,
        "v_descale": None,
        "num_splits": 1,
        "pack_gqa": None,
        "sm_margin": 0,
    }
    result = flash_attn_3_func_hub(
        q=query,
        k=key,
        v=value,
        softmax_scale=scale,
        causal=is_causal,
        window_size=window_size,
        softcap=softcap,
        deterministic=deterministic,
        return_attn_probs=return_attn_probs,
        **fixed_kernel_kwargs,
    )
    if not return_attn_probs:
        return result
    # When `return_attn_probs` is True, the kernel returns a tuple of
    # actual outputs and lse.
    attn_output, lse = result[0], result[1]
    return attn_output, lse
@_AttentionBackendRegistry.register(
AttentionBackendName._FLASH_VARLEN_3,
constraints=[_check_device, _check_qkv_dtype_bf16_or_fp16, _check_shape],
@@ -955,12 +1018,13 @@ def _native_npu_attention(
dropout_p: float = 0.0,
scale: Optional[float] = None,
) -> torch.Tensor:
return npu_fusion_attention(
query, key, value = (x.transpose(1, 2).contiguous() for x in (query, key, value))
out = npu_fusion_attention(
query,
key,
value,
query.size(2), # num_heads
input_layout="BSND",
query.size(1), # num_heads
input_layout="BNSD",
pse=None,
scale=1.0 / math.sqrt(query.shape[-1]) if scale is None else scale,
pre_tockens=65536,
@@ -969,6 +1033,8 @@ def _native_npu_attention(
sync=False,
inner_precise=0,
)[0]
out = out.transpose(1, 2).contiguous()
return out
# Reference: https://github.com/pytorch/xla/blob/06c5533de6588f6b90aa1655d9850bcf733b90b4/torch_xla/experimental/custom_kernel.py#L853
@@ -21,7 +21,7 @@ import torch.nn as nn
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
from ...utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
from ..attention import FeedForward
from ..attention import AttentionMixin, FeedForward
from ..cache_utils import CacheMixin
from ..modeling_outputs import Transformer2DModelOutput
from ..modeling_utils import ModelMixin
@@ -134,7 +134,9 @@ class WanVACETransformerBlock(nn.Module):
return conditioning_states, control_hidden_states
class WanVACETransformer3DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
class WanVACETransformer3DModel(
ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin, AttentionMixin
):
r"""
A Transformer model for video-like data used in the Wan model.
@@ -47,6 +47,12 @@ else:
_import_structure["stable_diffusion_xl"] = ["StableDiffusionXLAutoBlocks", "StableDiffusionXLModularPipeline"]
_import_structure["wan"] = ["WanAutoBlocks", "WanModularPipeline"]
_import_structure["flux"] = ["FluxAutoBlocks", "FluxModularPipeline"]
_import_structure["qwenimage"] = [
"QwenImageAutoBlocks",
"QwenImageModularPipeline",
"QwenImageEditModularPipeline",
"QwenImageEditAutoBlocks",
]
_import_structure["components_manager"] = ["ComponentsManager"]
if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
@@ -68,6 +74,12 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
SequentialPipelineBlocks,
)
from .modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, InsertableDict, OutputParam
from .qwenimage import (
QwenImageAutoBlocks,
QwenImageEditAutoBlocks,
QwenImageEditModularPipeline,
QwenImageModularPipeline,
)
from .stable_diffusion_xl import StableDiffusionXLAutoBlocks, StableDiffusionXLModularPipeline
from .wan import WanAutoBlocks, WanModularPipeline
else:
@@ -454,6 +454,9 @@ class FluxImg2ImgSetTimestepsStep(ModularPipelineBlocks):
block_state = self.get_block_state(state)
block_state.device = components._execution_device
block_state.height = block_state.height or components.default_height
block_state.width = block_state.width or components.default_width
scheduler = components.scheduler
transformer = components.transformer
batch_size = block_state.batch_size * block_state.num_images_per_prompt
@@ -659,8 +662,6 @@ class FluxImg2ImgPrepareLatentsStep(ModularPipelineBlocks):
def __call__(self, components: FluxModularPipeline, state: PipelineState) -> PipelineState:
block_state = self.get_block_state(state)
block_state.height = block_state.height or components.default_height
block_state.width = block_state.width or components.default_width
block_state.device = components._execution_device
block_state.dtype = torch.bfloat16 # TODO: okay to hardcode this?
block_state.num_channels_latents = components.num_channels_latents
@@ -220,7 +220,7 @@ class FluxDenoiseStep(FluxDenoiseLoopWrapper):
return (
"Denoise step that iteratively denoise the latents. \n"
"Its loop logic is defined in `FluxDenoiseLoopWrapper.__call__` method \n"
"At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
"At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
" - `FluxLoopDenoiser`\n"
" - `FluxLoopAfterDenoiser`\n"
"This block supports both text2image and img2img tasks."
@@ -148,8 +148,8 @@ TEXT2IMAGE_BLOCKS = InsertableDict(
[
("text_encoder", FluxTextEncoderStep),
("input", FluxInputStep),
("set_timesteps", FluxSetTimestepsStep),
("prepare_latents", FluxPrepareLatentsStep),
("set_timesteps", FluxSetTimestepsStep),
("denoise", FluxDenoiseStep),
("decode", FluxDecodeStep),
]
@@ -56,6 +56,8 @@ MODULAR_PIPELINE_MAPPING = OrderedDict(
("stable-diffusion-xl", "StableDiffusionXLModularPipeline"),
("wan", "WanModularPipeline"),
("flux", "FluxModularPipeline"),
("qwenimage", "QwenImageModularPipeline"),
("qwenimage-edit", "QwenImageEditModularPipeline"),
]
)
@@ -64,6 +66,8 @@ MODULAR_PIPELINE_BLOCKS_MAPPING = OrderedDict(
("StableDiffusionXLModularPipeline", "StableDiffusionXLAutoBlocks"),
("WanModularPipeline", "WanAutoBlocks"),
("FluxModularPipeline", "FluxAutoBlocks"),
("QwenImageModularPipeline", "QwenImageAutoBlocks"),
("QwenImageEditModularPipeline", "QwenImageEditAutoBlocks"),
]
)
@@ -128,6 +132,15 @@ class PipelineState:
"""
return {**self.__dict__}
def __getattr__(self, name):
    """
    Allow attribute access to intermediate values. If an attribute is not found in the object, look for it in the
    intermediates dict.

    Raises:
        AttributeError: If `name` is not a key of `values`, or if `values` has not been set on the instance yet.
    """
    # `__getattr__` only runs after normal lookup has failed, so reading `self.values` here would re-enter this
    # method without bound whenever `values` itself is missing (e.g. on the blank instance that `copy`/`pickle`
    # create before restoring state). Bypass `__getattr__` for that lookup.
    try:
        values = object.__getattribute__(self, "values")
    except AttributeError:
        values = {}
    if name in values:
        return values[name]
    raise AttributeError(f"'{self.__class__.__name__}' object has no attribute '{name}'")
def __repr__(self):
def format_value(v):
if hasattr(v, "shape") and hasattr(v, "dtype"):
@@ -220,7 +233,7 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
Base class for all Pipeline Blocks: PipelineBlock, AutoPipelineBlocks, SequentialPipelineBlocks,
LoopSequentialPipelineBlocks
[`ModularPipelineBlocks`] provides method to load and save the defination of pipeline blocks.
[`ModularPipelineBlocks`] provides method to load and save the definition of pipeline blocks.
<Tip warning={true}>
@@ -290,7 +303,7 @@ class ModularPipelineBlocks(ConfigMixin, PushToHubMixin):
def from_pretrained(
cls,
pretrained_model_name_or_path: str,
trust_remote_code: Optional[bool] = None,
trust_remote_code: bool = False,
**kwargs,
):
hub_kwargs_names = [
@@ -539,8 +552,11 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
def __init__(self):
sub_blocks = InsertableDict()
for block_name, block_cls in zip(self.block_names, self.block_classes):
sub_blocks[block_name] = block_cls()
for block_name, block in zip(self.block_names, self.block_classes):
if inspect.isclass(block):
sub_blocks[block_name] = block()
else:
sub_blocks[block_name] = block
self.sub_blocks = sub_blocks
if not (len(self.block_classes) == len(self.block_names) == len(self.block_trigger_inputs)):
raise ValueError(
@@ -638,7 +654,7 @@ class AutoPipelineBlocks(ModularPipelineBlocks):
break
if block is None:
logger.warning(f"skipping auto block: {self.__class__.__name__}")
logger.info(f"skipping auto block: {self.__class__.__name__}")
return pipeline, state
try:
@@ -821,7 +837,9 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
return expected_configs
@classmethod
def from_blocks_dict(cls, blocks_dict: Dict[str, Any]) -> "SequentialPipelineBlocks":
def from_blocks_dict(
cls, blocks_dict: Dict[str, Any], description: Optional[str] = None
) -> "SequentialPipelineBlocks":
"""Creates a SequentialPipelineBlocks instance from a dictionary of blocks.
Args:
@@ -843,12 +861,19 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
instance.block_classes = [block.__class__ for block in sub_blocks.values()]
instance.block_names = list(sub_blocks.keys())
instance.sub_blocks = sub_blocks
if description is not None:
instance.description = description
return instance
def __init__(self):
    """
    Build `self.sub_blocks` from `self.block_names` and `self.block_classes`.

    Each entry of `block_classes` may be a class, which is instantiated with no arguments, or an already
    constructed object, which is stored as-is.
    """
    sub_blocks = InsertableDict()
    for block_name, block in zip(self.block_names, self.block_classes):
        # Only classes are called; anything else is treated as a ready-made block.
        sub_blocks[block_name] = block() if inspect.isclass(block) else block
    self.sub_blocks = sub_blocks
def _get_inputs(self):
@@ -1271,8 +1296,11 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
def __init__(self):
    """
    Build `self.sub_blocks` from `self.block_names` and `self.block_classes`.

    Each entry of `block_classes` may be a class, which is instantiated with no arguments, or an already
    constructed object, which is stored as-is.
    """
    sub_blocks = InsertableDict()
    for block_name, block in zip(self.block_names, self.block_classes):
        # Only classes are called; anything else is treated as a ready-made block.
        sub_blocks[block_name] = block() if inspect.isclass(block) else block
    self.sub_blocks = sub_blocks
@classmethod
@@ -1409,7 +1437,7 @@ class LoopSequentialPipelineBlocks(ModularPipelineBlocks):
# YiYi TODO:
# 1. look into the serialization of modular_model_index.json, make sure the items are properly ordered like model_index.json (currently a mess)
# 2. do we need ConfigSpec? they are basically just key/val kwargs
# 3. imnprove docstring and potentially add validator for methods where we accpet kwargs to be passed to from_pretrained/save_pretrained/load_default_components(), load_components()
# 3. improve docstring and potentially add validator for methods where we accept kwargs to be passed to from_pretrained/save_pretrained/load_components()
class ModularPipeline(ConfigMixin, PushToHubMixin):
"""
Base class for all Modular pipelines.
@@ -1450,9 +1478,10 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
Args:
blocks: `ModularPipelineBlocks` instance. If None, will attempt to load
default blocks based on the pipeline class name.
pretrained_model_name_or_path: Path to a pretrained pipeline configuration. If provided,
will load component specs (only for from_pretrained components) and config values from the saved
modular_model_index.json file.
pretrained_model_name_or_path: Path to a pretrained pipeline configuration. Can be None if the pipeline
does not require any additional loading config. If provided, will first try to load component specs
(only for from_pretrained components) and config values from `modular_model_index.json`, then
fallback to `model_index.json` for compatibility with standard non-modular repositories.
components_manager:
Optional ComponentsManager for managing multiple components across different pipelines and applying
offloading strategies.
@@ -1478,7 +1507,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
- Components with default_creation_method="from_config" are created immediately, its specs are not included
in config dict and will not be saved in `modular_model_index.json`
- Components with default_creation_method="from_pretrained" are set to None and can be loaded later with
`load_default_components()`/`load_components()`
`load_components()` (with or without specific component names)
- The pipeline's config dict is populated with component specs (only for from_pretrained components) and
config values, which will be saved as `modular_model_index.json` during `save_pretrained`
- The pipeline's config dict is also used to store the pipeline blocks's class name, which will be saved as
@@ -1501,18 +1530,70 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
# update component_specs and config_specs from modular_repo
if pretrained_model_name_or_path is not None:
config_dict = self.load_config(pretrained_model_name_or_path, **kwargs)
cache_dir = kwargs.pop("cache_dir", None)
force_download = kwargs.pop("force_download", False)
proxies = kwargs.pop("proxies", None)
token = kwargs.pop("token", None)
local_files_only = kwargs.pop("local_files_only", False)
revision = kwargs.pop("revision", None)
for name, value in config_dict.items():
# all the components in modular_model_index.json are from_pretrained components
if name in self._component_specs and isinstance(value, (tuple, list)) and len(value) == 3:
library, class_name, component_spec_dict = value
component_spec = self._dict_to_component_spec(name, component_spec_dict)
component_spec.default_creation_method = "from_pretrained"
self._component_specs[name] = component_spec
load_config_kwargs = {
"cache_dir": cache_dir,
"force_download": force_download,
"proxies": proxies,
"token": token,
"local_files_only": local_files_only,
"revision": revision,
}
# try to load modular_model_index.json
try:
config_dict = self.load_config(pretrained_model_name_or_path, **load_config_kwargs)
except EnvironmentError as e:
logger.debug(f"modular_model_index.json not found: {e}")
config_dict = None
elif name in self._config_specs:
self._config_specs[name].default = value
# update component_specs and config_specs based on modular_model_index.json
if config_dict is not None:
for name, value in config_dict.items():
# all the components in modular_model_index.json are from_pretrained components
if name in self._component_specs and isinstance(value, (tuple, list)) and len(value) == 3:
library, class_name, component_spec_dict = value
component_spec = self._dict_to_component_spec(name, component_spec_dict)
component_spec.default_creation_method = "from_pretrained"
self._component_specs[name] = component_spec
elif name in self._config_specs:
self._config_specs[name].default = value
# if modular_model_index.json is not found, try to load model_index.json
else:
logger.debug(" loading config from model_index.json")
try:
from diffusers import DiffusionPipeline
config_dict = DiffusionPipeline.load_config(pretrained_model_name_or_path, **load_config_kwargs)
except EnvironmentError as e:
logger.debug(f" model_index.json not found in the repo: {e}")
config_dict = None
# update component_specs and config_specs based on model_index.json
if config_dict is not None:
for name, value in config_dict.items():
if name in self._component_specs and isinstance(value, (tuple, list)) and len(value) == 2:
library, class_name = value
component_spec_dict = {
"repo": pretrained_model_name_or_path,
"subfolder": name,
"type_hint": (library, class_name),
}
component_spec = self._dict_to_component_spec(name, component_spec_dict)
component_spec.default_creation_method = "from_pretrained"
self._component_specs[name] = component_spec
elif name in self._config_specs:
self._config_specs[name].default = value
if len(kwargs) > 0:
logger.warning(f"Unexpected input '{kwargs.keys()}' provided. This input will be ignored.")
register_components_dict = {}
for name, component_spec in self._component_specs.items():
@@ -1541,20 +1622,6 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
params[input_param.name] = input_param.default
return params
def load_default_components(self, **kwargs):
    """
    Load every component whose spec has `default_creation_method == "from_pretrained"`.

    The matching names are collected from `self._component_specs` in iteration order and handed to
    `load_components` in a single call.

    Args:
        **kwargs: Forwarded unchanged to `load_components`, e.g. torch_dtype, cache_dir, etc.
    """
    pretrained_names = []
    for component_name, spec in self._component_specs.items():
        if spec.default_creation_method == "from_pretrained":
            pretrained_names.append(component_name)
    self.load_components(names=pretrained_names, **kwargs)
@classmethod
@validate_hf_hub_args
def from_pretrained(
@@ -1570,8 +1637,10 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
Args:
pretrained_model_name_or_path (`str` or `os.PathLike`, optional):
Path to a pretrained pipeline configuration. If provided, will load component specs (only for
from_pretrained components) and config values from the modular_model_index.json file.
Path to a pretrained pipeline configuration. It will first try to load config from
`modular_model_index.json`, then fallback to `model_index.json` for compatibility with standard
non-modular repositories. If the repo does not contain any pipeline config, it will be set to None
during initialization.
trust_remote_code (`bool`, optional):
Whether to trust remote code when loading the pipeline, need to be set to True if you want to create
pipeline blocks based on the custom code in `pretrained_model_name_or_path`
@@ -1607,11 +1676,35 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
}
try:
# try to load modular_model_index.json
config_dict = cls.load_config(pretrained_model_name_or_path, **load_config_kwargs)
except EnvironmentError as e:
logger.debug(f" modular_model_index.json not found in the repo: {e}")
config_dict = None
if config_dict is not None:
pipeline_class = _get_pipeline_class(cls, config=config_dict)
except EnvironmentError:
pipeline_class = cls
pretrained_model_name_or_path = None
else:
try:
logger.debug(" try to load model_index.json")
from diffusers import DiffusionPipeline
from diffusers.pipelines.auto_pipeline import _get_model
config_dict = DiffusionPipeline.load_config(pretrained_model_name_or_path, **load_config_kwargs)
except EnvironmentError as e:
logger.debug(f" model_index.json not found in the repo: {e}")
if config_dict is not None:
logger.debug(" try to determine the modular pipeline class from model_index.json")
standard_pipeline_class = _get_pipeline_class(cls, config=config_dict)
model_name = _get_model(standard_pipeline_class.__name__)
pipeline_class_name = MODULAR_PIPELINE_MAPPING.get(model_name, ModularPipeline.__name__)
diffusers_module = importlib.import_module("diffusers")
pipeline_class = getattr(diffusers_module, pipeline_class_name)
else:
# there is no config for modular pipeline, assuming that the pipeline block does not need any from_pretrained components
pipeline_class = cls
pretrained_model_name_or_path = None
pipeline = pipeline_class(
blocks=blocks,
@@ -1682,8 +1775,8 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
- non from_pretrained components are created during __init__ and registered as the object itself
- Components are updated with the `update_components()` method: e.g. loader.update_components(unet=unet) or
loader.update_components(guider=guider_spec)
- (from_pretrained) Components are loaded with the `load_default_components()` method: e.g.
loader.load_default_components(names=["unet"])
- (from_pretrained) Components are loaded with the `load_components()` method: e.g.
loader.load_components(names=["unet"]) or loader.load_components() to load all default components
Args:
**kwargs: Keyword arguments where keys are component names and values are component objects.
@@ -1949,17 +2042,31 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
for name, component in passed_components.items():
current_component_spec = self._component_specs[name]
# warn if type changed
# log if type changed
if current_component_spec.type_hint is not None and not isinstance(
component, current_component_spec.type_hint
):
logger.warning(
logger.info(
f"ModularPipeline.update_components: adding {name} with new type: {component.__class__.__name__}, previous type: {current_component_spec.type_hint.__name__}"
)
# update _component_specs based on the new component
new_component_spec = ComponentSpec.from_component(name, component)
if new_component_spec.default_creation_method != current_component_spec.default_creation_method:
if component is None:
new_component_spec = current_component_spec
if hasattr(self, name) and getattr(self, name) is not None:
logger.warning(f"ModularPipeline.update_components: setting {name} to None (spec unchanged)")
elif current_component_spec.default_creation_method == "from_pretrained" and not (
hasattr(component, "_diffusers_load_id") and component._diffusers_load_id is not None
):
logger.warning(
f"ModularPipeline.update_components: {name} has no valid _diffusers_load_id. "
f"This will result in empty loading spec, use ComponentSpec.load() for proper specs"
)
new_component_spec = ComponentSpec(name=name, type_hint=type(component))
else:
new_component_spec = ComponentSpec.from_component(name, component)
if new_component_spec.default_creation_method != current_component_spec.default_creation_method:
logger.info(
f"ModularPipeline.update_components: changing the default_creation_method of {name} from {current_component_spec.default_creation_method} to {new_component_spec.default_creation_method}."
)
@@ -1980,7 +2087,7 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
if current_component_spec.type_hint is not None and not isinstance(
created_components[name], current_component_spec.type_hint
):
logger.warning(
logger.info(
f"ModularPipeline.update_components: adding {name} with new type: {created_components[name].__class__.__name__}, previous type: {current_component_spec.type_hint.__name__}"
)
# update _component_specs based on the user passed component_spec
@@ -1995,13 +2102,14 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
self.register_to_config(**config_to_register)
# YiYi TODO: support map for additional from_pretrained kwargs
# YiYi/Dhruv TODO: consolidate load_components and load_default_components?
def load_components(self, names: Union[List[str], str], **kwargs):
def load_components(self, names: Optional[Union[List[str], str]] = None, **kwargs):
"""
Load selected components from specs.
Args:
names: List of component names to load; by default will not load any components
names: List of component names to load. If None, will load all components with
default_creation_method == "from_pretrained". If provided as a list or string, will load only the
specified components.
**kwargs: additional kwargs to be passed to `from_pretrained()`. Can be:
- a single value to be applied to all components to be loaded, e.g. torch_dtype=torch.bfloat16
- a dict, e.g. torch_dtype={"unet": torch.bfloat16, "default": torch.float32}
@@ -2009,7 +2117,13 @@ class ModularPipeline(ConfigMixin, PushToHubMixin):
`variant`, `revision`, etc.
"""
if isinstance(names, str):
if names is None:
names = [
name
for name in self._component_specs.keys()
if self._component_specs[name].default_creation_method == "from_pretrained"
]
elif isinstance(names, str):
names = [names]
elif not isinstance(names, list):
raise ValueError(f"Invalid type for names: {type(names)}")
@@ -384,14 +384,14 @@ class ModularNode(ConfigMixin):
# pass or create a default param dict for each input
# e.g. for prompt,
# prompt = {
# "name": "text_input", # the name of the input in node defination, could be different from the input name in diffusers
# "name": "text_input", # the name of the input in node definition, could be different from the input name in diffusers
# "label": "Prompt",
# "type": "string",
# "default": "a bear sitting in a chair drinking a milkshake",
# "display": "textarea"}
# if type is not specified, it'll be a "custom" param of its own type
# e.g. you can pass ModularNode(scheduler = {name :"scheduler"})
# it will get this spec in node defination {"scheduler": {"label": "Scheduler", "type": "scheduler", "display": "input"}}
# it will get this spec in node definition {"scheduler": {"label": "Scheduler", "type": "scheduler", "display": "input"}}
# name can be a dict, in that case, it is part of a "dict" input in mellon nodes, e.g. text_encoder= {name: {"text_encoders": "text_encoder"}}
inputs = self.blocks.inputs + self.blocks.intermediate_inputs
for inp in inputs:
@@ -0,0 +1,75 @@
from typing import TYPE_CHECKING
from ...utils import (
DIFFUSERS_SLOW_IMPORT,
OptionalDependencyNotAvailable,
_LazyModule,
get_objects_from_module,
is_torch_available,
is_transformers_available,
)
# Placeholder objects that are attached to this module when torch and transformers are not both installed.
_dummy_objects = {}
# Maps submodule name -> list of public names; handed to `_LazyModule` at the bottom of this file.
_import_structure = {}

try:
    if not (is_transformers_available() and is_torch_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_torch_and_transformers_objects  # noqa F403

    # Collect the dummy stand-ins so they can be set on the module object further down.
    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
    # Both backends are available: declare which names each submodule exports.
    _import_structure["encoders"] = ["QwenImageTextEncoderStep"]
    _import_structure["modular_blocks"] = [
        "ALL_BLOCKS",
        "AUTO_BLOCKS",
        "CONTROLNET_BLOCKS",
        "EDIT_AUTO_BLOCKS",
        "EDIT_BLOCKS",
        "EDIT_INPAINT_BLOCKS",
        "IMAGE2IMAGE_BLOCKS",
        "INPAINT_BLOCKS",
        "TEXT2IMAGE_BLOCKS",
        "QwenImageAutoBlocks",
        "QwenImageEditAutoBlocks",
    ]
    _import_structure["modular_pipeline"] = ["QwenImageEditModularPipeline", "QwenImageModularPipeline"]

if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    # Eager path (type checkers, or slow import requested): import the real symbols, or the dummies
    # when the optional dependencies are missing. The name lists here mirror `_import_structure` above
    # and must be kept in sync with it by hand.
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
    else:
        from .encoders import (
            QwenImageTextEncoderStep,
        )
        from .modular_blocks import (
            ALL_BLOCKS,
            AUTO_BLOCKS,
            CONTROLNET_BLOCKS,
            EDIT_AUTO_BLOCKS,
            EDIT_BLOCKS,
            EDIT_INPAINT_BLOCKS,
            IMAGE2IMAGE_BLOCKS,
            INPAINT_BLOCKS,
            TEXT2IMAGE_BLOCKS,
            QwenImageAutoBlocks,
            QwenImageEditAutoBlocks,
        )
        from .modular_pipeline import QwenImageEditModularPipeline, QwenImageModularPipeline
else:
    import sys

    # Lazy path: swap this module's entry in `sys.modules` for a `_LazyModule` built from
    # `_import_structure` (presumably so submodules are only imported on first attribute access).
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    # Expose the dummy placeholders (if any were collected) as attributes of the replaced module.
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
@@ -0,0 +1,727 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import inspect
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
from ...models import QwenImageControlNetModel, QwenImageMultiControlNetModel
from ...schedulers import FlowMatchEulerDiscreteScheduler
from ...utils.torch_utils import randn_tensor, unwrap_module
from ..modular_pipeline import ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier
# Adapted from diffusers.pipelines.qwenimage.pipeline_qwenimage.calculate_shift
# (same arithmetic, restyled and documented; no longer a verbatim copy)
def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
):
    """
    Linearly map an image sequence length to a shift value.

    The result lies on the straight line through `(base_seq_len, base_shift)` and `(max_seq_len, max_shift)`.
    Sequence lengths outside that interval are extrapolated along the same line; nothing is clamped.

    Args:
        image_seq_len: Sequence length to evaluate the line at.
        base_seq_len (`int`): Sequence length at which the result equals `base_shift`.
        max_seq_len (`int`): Sequence length at which the result equals `max_shift`.
        base_shift (`float`): Shift value at `base_seq_len`.
        max_shift (`float`): Shift value at `max_seq_len`.

    Returns:
        The interpolated (or extrapolated) shift value.
    """
    slope = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    intercept = base_shift - slope * base_seq_len
    return image_seq_len * slope + intercept
# Adapted from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
# (same behavior, restyled with guard clauses; no longer a verbatim copy)
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    r"""
    Configure `scheduler` through its `set_timesteps` method and return the resulting schedule.

    Exactly one of three modes is used: a custom `timesteps` list, a custom `sigmas` list, or (when neither is
    given) a plain `num_inference_steps` count. Extra `kwargs` are forwarded to `scheduler.set_timesteps`.

    Args:
        scheduler (`SchedulerMixin`):
            The scheduler to configure and read timesteps from.
        num_inference_steps (`int`, *optional*):
            Number of steps; only used when neither `timesteps` nor `sigmas` is passed.
        device (`str` or `torch.device`, *optional*):
            Forwarded to `scheduler.set_timesteps` as `device`.
        timesteps (`List[int]`, *optional*):
            Custom timesteps. Mutually exclusive with `sigmas`.
        sigmas (`List[float]`, *optional*):
            Custom sigmas. Mutually exclusive with `timesteps`.

    Returns:
        `Tuple[torch.Tensor, int]`: `scheduler.timesteps` after configuration, and the number of inference steps
        (the length of that schedule when custom `timesteps`/`sigmas` were given, otherwise the
        `num_inference_steps` argument unchanged).

    Raises:
        ValueError: If both `timesteps` and `sigmas` are passed, or if the scheduler's `set_timesteps` does not
            accept the custom argument that was passed.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

    # Default mode: no custom schedule, the step count is passed through untouched.
    if timesteps is None and sigmas is None:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        return scheduler.timesteps, num_inference_steps

    # Custom mode: make sure the scheduler's `set_timesteps` exposes the keyword we are about to use.
    supported_params = inspect.signature(scheduler.set_timesteps).parameters
    if timesteps is not None:
        if "timesteps" not in supported_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" timestep schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
    else:
        if "sigmas" not in supported_params:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
                f" sigmas schedules. Please check whether you are using the correct scheduler."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)

    schedule = scheduler.timesteps
    return schedule, len(schedule)
# modified from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_img2img.StableDiffusion3Img2ImgPipeline.get_timesteps
def get_timesteps(scheduler, num_inference_steps, strength):
    """
    Trim the scheduler's timestep schedule according to `strength`.

    The first `num_inference_steps - num_inference_steps * strength` steps (truncated to an int, never negative)
    are dropped from `scheduler.timesteps`. If the scheduler exposes `set_begin_index`, it is called with the
    index of the first retained timestep.

    Args:
        scheduler: Scheduler whose `timesteps` and `order` attributes are read.
        num_inference_steps: Length of the full schedule, in steps.
        strength: Fraction of the schedule to keep.

    Returns:
        A tuple `(timesteps, steps)`: the retained slice of `scheduler.timesteps` and the number of retained steps.
    """
    # number of steps that will actually be run, capped at the full schedule
    steps_to_run = min(num_inference_steps * strength, num_inference_steps)
    first_step = int(max(num_inference_steps - steps_to_run, 0))

    # a step spans `scheduler.order` entries of `scheduler.timesteps`
    begin_index = first_step * scheduler.order
    remaining = scheduler.timesteps[begin_index:]
    if hasattr(scheduler, "set_begin_index"):
        scheduler.set_begin_index(begin_index)

    return remaining, num_inference_steps - first_step
# Prepare Latents steps
class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
    """
    Pipeline block that creates the initial packed noise latents.

    Reads `height`, `width`, `num_images_per_prompt`, `generator`, `batch_size` and `dtype` from the pipeline state,
    draws a noise tensor with `randn_tensor`, packs it with the `pachifier` component and stores it as `latents`.
    Missing `height`/`width` are replaced by the pipeline defaults and written back to the state.
    """

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "Prepare initial random noise for the generation process"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        pachifier_spec = ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config")
        return [pachifier_spec]

    @property
    def inputs(self) -> List[InputParam]:
        # user-facing inputs
        params = [
            InputParam(name="height"),
            InputParam(name="width"),
            InputParam(name="num_images_per_prompt", default=1),
            InputParam(name="generator"),
        ]
        # inputs that an earlier block is expected to provide
        params.append(
            InputParam(
                name="batch_size",
                required=True,
                type_hint=int,
                description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt. Can be generated in input step.",
            )
        )
        params.append(
            InputParam(
                name="dtype",
                required=True,
                type_hint=torch.dtype,
                description="The dtype of the model inputs, can be generated in input step.",
            )
        )
        return params

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        latents_param = OutputParam(
            name="latents",
            type_hint=torch.Tensor,
            description="The initial latents to use for the denoising process",
        )
        return [latents_param]

    @staticmethod
    def check_inputs(height, width, vae_scale_factor):
        """Raise `ValueError` if a given `height`/`width` is not a multiple of `vae_scale_factor * 2`."""
        divisor = vae_scale_factor * 2

        if height is not None and height % divisor != 0:
            raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}")

        if width is not None and width % divisor != 0:
            raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}")

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        self.check_inputs(
            height=block_state.height,
            width=block_state.width,
            vae_scale_factor=components.vae_scale_factor,
        )

        device = components._execution_device
        effective_batch_size = block_state.batch_size * block_state.num_images_per_prompt

        # fall back to the pipeline defaults; the resolved values are written back to the block state
        # because they determine the shape of the noise generated below
        block_state.height = block_state.height or components.default_height
        block_state.width = block_state.width or components.default_width

        # VAE applies 8x compression on images but we must also account for packing which requires
        # latent height and width to be divisible by 2.
        packing_unit = components.vae_scale_factor * 2
        latent_height = 2 * (int(block_state.height) // packing_unit)
        latent_width = 2 * (int(block_state.width) // packing_unit)
        noise_shape = (effective_batch_size, components.num_channels_latents, 1, latent_height, latent_width)

        generator = block_state.generator
        if isinstance(generator, list) and len(generator) != effective_batch_size:
            raise ValueError(
                f"You have passed a list of generators of length {len(block_state.generator)}, but requested an effective batch"
                f" size of {effective_batch_size}. Make sure the batch size matches the length of the generators."
            )

        noise = randn_tensor(noise_shape, generator=generator, device=device, dtype=block_state.dtype)
        block_state.latents = components.pachifier.pack_latents(noise)

        self.set_block_state(state, block_state)
        return components, state
class QwenImagePrepareLatentsWithStrengthStep(ModularPipelineBlocks):
    """
    Pipeline block that blends noise into image latents for image-to-image / inpainting.

    Takes the noise in `latents`, the `image_latents` and the `timesteps` from the pipeline state, keeps the
    unmodified noise as `initial_noise`, and replaces `latents` with the result of `scheduler.scale_noise` evaluated
    at the first timestep.
    """

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "Step that adds noise to image latents for image-to-image/inpainting. Should be run after set_timesteps, prepare_latents. Both noise and image latents should alreadybe patchified."

    @property
    def expected_components(self) -> List[ComponentSpec]:
        scheduler_spec = ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)
        return [scheduler_spec]

    @property
    def inputs(self) -> List[InputParam]:
        noise_param = InputParam(
            name="latents",
            required=True,
            type_hint=torch.Tensor,
            description="The initial random noised, can be generated in prepare latent step.",
        )
        image_latents_param = InputParam(
            name="image_latents",
            required=True,
            type_hint=torch.Tensor,
            description="The image latents to use for the denoising process. Can be generated in vae encoder and packed in input step.",
        )
        timesteps_param = InputParam(
            name="timesteps",
            required=True,
            type_hint=torch.Tensor,
            description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
        )
        return [noise_param, image_latents_param, timesteps_param]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        initial_noise_param = OutputParam(
            name="initial_noise",
            type_hint=torch.Tensor,
            description="The initial random noised used for inpainting denoising.",
        )
        return [initial_noise_param]

    @staticmethod
    def check_inputs(image_latents, latents):
        """Raise `ValueError` unless `image_latents` is 3-dimensional and shares its batch size with `latents`."""
        image_batch, noise_batch = image_latents.shape[0], latents.shape[0]
        if image_batch != noise_batch:
            raise ValueError(
                f"`image_latents` must have have same batch size as `latents`, but got {image_latents.shape[0]} and {latents.shape[0]}"
            )

        if image_latents.ndim != 3:
            raise ValueError(f"`image_latents` must have 3 dimensions (patchified), but got {image_latents.ndim}")

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        noise = block_state.latents
        image_latents = block_state.image_latents
        self.check_inputs(image_latents=image_latents, latents=noise)

        # the first timestep of the schedule, repeated once per batch entry
        latent_timestep = block_state.timesteps[:1].repeat(noise.shape[0])

        # keep a reference to the untouched noise (no copy is made; `latents` is rebound below)
        block_state.initial_noise = noise

        # scale noise
        block_state.latents = components.scheduler.scale_noise(image_latents, latent_timestep, noise)

        self.set_block_state(state, block_state)
        return components, state
class QwenImageCreateMaskLatentsStep(ModularPipelineBlocks):
    """
    Pipeline block that converts a preprocessed mask image into a packed latent-space mask.

    The mask is resized with `torch.nn.functional.interpolate` to the latent height/width, given an extra dimension
    at index 2, repeated `num_channels_latents` times along dimension 1, moved to the execution device / `dtype`,
    and finally packed with the `pachifier` component. The result is stored as `mask`.
    """

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "Step that creates mask latents from preprocessed mask_image by interpolating to latent space."

    @property
    def expected_components(self) -> List[ComponentSpec]:
        pachifier_spec = ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config")
        return [pachifier_spec]

    @property
    def inputs(self) -> List[InputParam]:
        mask_param = InputParam(
            name="processed_mask_image",
            required=True,
            type_hint=torch.Tensor,
            description="The processed mask to use for the inpainting process.",
        )
        return [
            mask_param,
            InputParam(name="height", required=True),
            InputParam(name="width", required=True),
            InputParam(name="dtype", required=True),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        mask_param = OutputParam(
            name="mask", type_hint=torch.Tensor, description="The mask to use for the inpainting process."
        )
        return [mask_param]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        device = components._execution_device

        # VAE applies 8x compression on images but we must also account for packing which requires
        # latent height and width to be divisible by 2.
        packing_unit = components.vae_scale_factor * 2
        latent_size = (
            2 * (int(block_state.height) // packing_unit),
            2 * (int(block_state.width) // packing_unit),
        )

        mask = torch.nn.functional.interpolate(block_state.processed_mask_image, size=latent_size)
        # add a dimension at index 2, then repeat along dim 1 once per latent channel
        mask = mask.unsqueeze(2).repeat(1, components.num_channels_latents, 1, 1, 1)
        mask = mask.to(device=device, dtype=block_state.dtype)
        block_state.mask = components.pachifier.pack_latents(mask)

        self.set_block_state(state, block_state)
        return components, state
# Set Timesteps steps
class QwenImageSetTimestepsStep(ModularPipelineBlocks):
    """
    Pipeline block that configures the scheduler's timesteps for text-to-image generation.

    Builds a default sigma schedule with `np.linspace` when none is supplied, derives the shift `mu` from the
    latent sequence length (`latents.shape[1]`) via `calculate_shift`, calls `retrieve_timesteps`, and resets the
    scheduler's begin index to 0. Writes `timesteps` and the resulting `num_inference_steps` to the state.
    """

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "Step that sets the the scheduler's timesteps for text-to-image generation. Should be run after prepare latents step."

    @property
    def expected_components(self) -> List[ComponentSpec]:
        scheduler_spec = ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)
        return [scheduler_spec]

    @property
    def inputs(self) -> List[InputParam]:
        latents_param = InputParam(
            name="latents",
            required=True,
            type_hint=torch.Tensor,
            description="The latents to use for the denoising process, used to calculate the image sequence length.",
        )
        return [
            InputParam(name="num_inference_steps", default=50),
            InputParam(name="sigmas"),
            latents_param,
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        timesteps_param = OutputParam(
            name="timesteps", type_hint=torch.Tensor, description="The timesteps to use for the denoising process"
        )
        return [timesteps_param]

    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        device = components._execution_device
        requested_steps = block_state.num_inference_steps

        # default sigma schedule: evenly spaced from 1.0 down to 1 / num_inference_steps
        sigmas = block_state.sigmas
        if sigmas is None:
            sigmas = np.linspace(1.0, 1 / requested_steps, requested_steps)

        scheduler_config = components.scheduler.config
        mu = calculate_shift(
            image_seq_len=block_state.latents.shape[1],
            base_seq_len=scheduler_config.get("base_image_seq_len", 256),
            max_seq_len=scheduler_config.get("max_image_seq_len", 4096),
            base_shift=scheduler_config.get("base_shift", 0.5),
            max_shift=scheduler_config.get("max_shift", 1.15),
        )

        timesteps, num_inference_steps = retrieve_timesteps(
            scheduler=components.scheduler,
            num_inference_steps=requested_steps,
            device=device,
            sigmas=sigmas,
            mu=mu,
        )
        block_state.timesteps = timesteps
        block_state.num_inference_steps = num_inference_steps

        components.scheduler.set_begin_index(0)

        self.set_block_state(state, block_state)
        return components, state
class QwenImageSetTimestepsWithStrengthStep(ModularPipelineBlocks):
    """
    Pipeline block that configures the scheduler's timesteps for image-to-image and inpainting.

    Works like the plain set-timesteps block (default sigmas from `np.linspace`, shift `mu` from
    `calculate_shift`, schedule from `retrieve_timesteps`) and then trims the schedule with `get_timesteps`
    according to `strength`. Writes the trimmed `timesteps` and `num_inference_steps` to the state.
    """

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "Step that sets the the scheduler's timesteps for image-to-image generation, and inpainting. Should be run after prepare latents step."

    @property
    def expected_components(self) -> List[ComponentSpec]:
        scheduler_spec = ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)
        return [scheduler_spec]

    @property
    def inputs(self) -> List[InputParam]:
        latents_param = InputParam(
            name="latents",
            required=True,
            type_hint=torch.Tensor,
            description="The latents to use for the denoising process, used to calculate the image sequence length.",
        )
        return [
            InputParam(name="num_inference_steps", default=50),
            InputParam(name="sigmas"),
            latents_param,
            InputParam(name="strength", default=0.9),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        timesteps_param = OutputParam(
            name="timesteps",
            type_hint=torch.Tensor,
            description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
        )
        return [timesteps_param]

    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        device = components._execution_device
        requested_steps = block_state.num_inference_steps

        # default sigma schedule: evenly spaced from 1.0 down to 1 / num_inference_steps
        sigmas = block_state.sigmas
        if sigmas is None:
            sigmas = np.linspace(1.0, 1 / requested_steps, requested_steps)

        scheduler_config = components.scheduler.config
        mu = calculate_shift(
            image_seq_len=block_state.latents.shape[1],
            base_seq_len=scheduler_config.get("base_image_seq_len", 256),
            max_seq_len=scheduler_config.get("max_image_seq_len", 4096),
            base_shift=scheduler_config.get("base_shift", 0.5),
            max_shift=scheduler_config.get("max_shift", 1.15),
        )

        # first configure the full schedule on the scheduler; only its length is needed here because
        # the timesteps themselves are re-read from the scheduler by `get_timesteps`
        _, full_schedule_steps = retrieve_timesteps(
            scheduler=components.scheduler,
            num_inference_steps=requested_steps,
            device=device,
            sigmas=sigmas,
            mu=mu,
        )

        # then keep only the tail of the schedule selected by `strength`
        block_state.timesteps, block_state.num_inference_steps = get_timesteps(
            scheduler=components.scheduler,
            num_inference_steps=full_schedule_steps,
            strength=block_state.strength,
        )

        self.set_block_state(state, block_state)
        return components, state
# other inputs for denoiser
## RoPE inputs for denoiser
class QwenImageRoPEInputsStep(ModularPipelineBlocks):
    """
    Pipeline block that derives the RoPE inputs for the denoiser.

    Outputs written to the pipeline state:
        - `img_shapes`: one list per prompt (`batch_size` entries), each holding a single
          `(1, height // vae_scale_factor // 2, width // vae_scale_factor // 2)` tuple.
        - `txt_seq_lens`: per-row sum of `prompt_embeds_mask` as a Python list, or `None` if no mask is given.
        - `negative_txt_seq_lens`: same for `negative_prompt_embeds_mask`.
    """

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return (
            "Step that prepares the RoPE inputs for the denoising process. Should be place after prepare_latents step"
        )

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(name="batch_size", required=True),
            InputParam(name="height", required=True),
            InputParam(name="width", required=True),
            InputParam(name="prompt_embeds_mask"),
            InputParam(name="negative_prompt_embeds_mask"),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                name="img_shapes",
                type_hint=List[List[Tuple[int, int, int]]],
                description="The shapes of the images latents, used for RoPE calculation",
            ),
            OutputParam(
                name="txt_seq_lens",
                kwargs_type="denoiser_input_fields",
                type_hint=List[int],
                description="The sequence lengths of the prompt embeds, used for RoPE calculation",
            ),
            OutputParam(
                name="negative_txt_seq_lens",
                kwargs_type="denoiser_input_fields",
                type_hint=List[int],
                description="The sequence lengths of the negative prompt embeds, used for RoPE calculation",
            ),
        ]

    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        latent_shape = (
            1,
            block_state.height // components.vae_scale_factor // 2,
            block_state.width // components.vae_scale_factor // 2,
        )
        # Replicate the OUTER list so there is one single-entry shape list per prompt
        # (len(img_shapes) == batch_size). This is the layout `QwenImageEditRoPEInputsStep` produces.
        # Replicating the inner list instead would give a single sample that owns `batch_size` shapes;
        # the two forms only coincide for batch_size == 1.
        block_state.img_shapes = [[latent_shape]] * block_state.batch_size

        # number of non-masked text tokens per row, or None when no mask was provided
        block_state.txt_seq_lens = (
            block_state.prompt_embeds_mask.sum(dim=1).tolist() if block_state.prompt_embeds_mask is not None else None
        )
        block_state.negative_txt_seq_lens = (
            block_state.negative_prompt_embeds_mask.sum(dim=1).tolist()
            if block_state.negative_prompt_embeds_mask is not None
            else None
        )

        self.set_block_state(state, block_state)

        return components, state
class QwenImageEditRoPEInputsStep(ModularPipelineBlocks):
    """
    Pipeline block that derives the RoPE inputs for the QwenImage Edit denoiser.

    Each per-prompt entry of `img_shapes` holds two shape tuples: one computed from the target `height`/`width`
    and one computed from the size of the resized input image. `txt_seq_lens` / `negative_txt_seq_lens` are the
    per-row sums of the corresponding embeds masks, or `None` when a mask is absent.
    """

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "Step that prepares the RoPE inputs for denoising process. This is used in QwenImage Edit. Should be place after prepare_latents step"

    @property
    def inputs(self) -> List[InputParam]:
        # NOTE(review): `type_hint=torch.Tensor` does not match how `__call__` uses this input (it unpacks
        # `.size` as `(width, height)` and accepts a list) -- confirm the intended type with the producer block.
        resized_image_param = InputParam(
            name="resized_image", required=True, type_hint=torch.Tensor, description="The resized image input"
        )
        return [
            InputParam(name="batch_size", required=True),
            resized_image_param,
            InputParam(name="height", required=True),
            InputParam(name="width", required=True),
            InputParam(name="prompt_embeds_mask"),
            InputParam(name="negative_prompt_embeds_mask"),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        img_shapes_param = OutputParam(
            name="img_shapes",
            type_hint=List[List[Tuple[int, int, int]]],
            description="The shapes of the images latents, used for RoPE calculation",
        )
        txt_seq_lens_param = OutputParam(
            name="txt_seq_lens",
            kwargs_type="denoiser_input_fields",
            type_hint=List[int],
            description="The sequence lengths of the prompt embeds, used for RoPE calculation",
        )
        negative_txt_seq_lens_param = OutputParam(
            name="negative_txt_seq_lens",
            kwargs_type="denoiser_input_fields",
            type_hint=List[int],
            description="The sequence lengths of the negative prompt embeds, used for RoPE calculation",
        )
        return [img_shapes_param, txt_seq_lens_param, negative_txt_seq_lens_param]

    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        # for edit, image size can be different from the target size (height/width)
        resized = block_state.resized_image
        reference_image = resized[0] if isinstance(resized, list) else resized
        image_width, image_height = reference_image.size

        scale = components.vae_scale_factor
        target_shape = (1, block_state.height // scale // 2, block_state.width // scale // 2)
        reference_shape = (1, image_height // scale // 2, image_width // scale // 2)
        # one [target, reference] pair of shapes per prompt
        block_state.img_shapes = [[target_shape, reference_shape]] * block_state.batch_size

        positive_mask = block_state.prompt_embeds_mask
        if positive_mask is not None:
            block_state.txt_seq_lens = positive_mask.sum(dim=1).tolist()
        else:
            block_state.txt_seq_lens = None

        negative_mask = block_state.negative_prompt_embeds_mask
        if negative_mask is not None:
            block_state.negative_txt_seq_lens = negative_mask.sum(dim=1).tolist()
        else:
            block_state.negative_txt_seq_lens = None

        self.set_block_state(state, block_state)

        return components, state
## ControlNet inputs for denoiser
class QwenImageControlNetBeforeDenoiserStep(ModularPipelineBlocks):
    """
    Pipeline block that normalizes the controlnet inputs and computes the per-step `controlnet_keep` values.

    - `control_guidance_start` / `control_guidance_end` are brought to lists of equal length.
    - For a `QwenImageMultiControlNetModel`, a float `controlnet_conditioning_scale` is expanded to one value per
      entry of `control_image_latents`.
    - `controlnet_keep[i]` is 1.0 when step `i` lies inside the `[start, end]` guidance window (expressed as a
      fraction of the schedule) and 0.0 otherwise; it is a single float for a `QwenImageControlNetModel` and a
      list of floats otherwise.
    """

    model_name = "qwenimage"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec("controlnet", QwenImageControlNetModel),
        ]

    @property
    def description(self) -> str:
        return "step that prepare inputs for controlnet. Insert before the Denoise Step, after set_timesteps step."

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam("control_guidance_start", default=0.0),
            InputParam("control_guidance_end", default=1.0),
            InputParam("controlnet_conditioning_scale", default=1.0),
            InputParam("control_image_latents", required=True),
            InputParam(
                "timesteps",
                required=True,
                type_hint=torch.Tensor,
                description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam("controlnet_keep", type_hint=List[float], description="The controlnet keep values"),
        ]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        controlnet = unwrap_module(components.controlnet)
        is_multi_controlnet = isinstance(controlnet, QwenImageMultiControlNetModel)

        # control_guidance_start/control_guidance_end (align format)
        guidance_start = block_state.control_guidance_start
        guidance_end = block_state.control_guidance_end
        start_is_list = isinstance(guidance_start, list)
        end_is_list = isinstance(guidance_end, list)
        if not start_is_list and end_is_list:
            guidance_start = len(guidance_end) * [guidance_start]
        elif not end_is_list and start_is_list:
            guidance_end = len(guidance_start) * [guidance_end]
        elif not start_is_list and not end_is_list:
            mult = len(block_state.control_image_latents) if is_multi_controlnet else 1
            guidance_start, guidance_end = mult * [guidance_start], mult * [guidance_end]
        block_state.control_guidance_start = guidance_start
        block_state.control_guidance_end = guidance_end

        # controlnet_conditioning_scale (align format)
        # The replication count is computed here rather than reused from the branch above: that branch is
        # skipped whenever either guidance bound is already a list, which previously left the count undefined
        # and raised an UnboundLocalError for a float scale.
        if is_multi_controlnet and isinstance(block_state.controlnet_conditioning_scale, float):
            block_state.controlnet_conditioning_scale = [block_state.controlnet_conditioning_scale] * len(
                block_state.control_image_latents
            )

        # controlnet_keep
        num_steps = len(block_state.timesteps)
        return_single_value = isinstance(controlnet, QwenImageControlNetModel)
        block_state.controlnet_keep = []
        for i in range(num_steps):
            # 0.0 if step i starts before the window opens or ends after it closes, else 1.0
            keeps = [
                1.0 - float(i / num_steps < s or (i + 1) / num_steps > e)
                for s, e in zip(guidance_start, guidance_end)
            ]
            block_state.controlnet_keep.append(keeps[0] if return_single_value else keeps)

        self.set_block_state(state, block_state)

        return components, state
@@ -0,0 +1,203 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Union
import numpy as np
import PIL
import torch
from ...configuration_utils import FrozenDict
from ...image_processor import InpaintProcessor, VaeImageProcessor
from ...models import AutoencoderKLQwenImage
from ...utils import logging
from ..modular_pipeline import ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier
logger = logging.get_logger(__name__)
class QwenImageDecoderStep(ModularPipelineBlocks):
    """
    Pipeline block that turns packed latents into decoded image tensors.

    The latents are unpacked with the `pachifier` component, cast to the VAE dtype, de-normalized with the
    `latents_mean` / `latents_std` values from the VAE config, and passed to `vae.decode`. Index 0 along the third
    dimension of the decoder output is stored as `images`.
    """

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "Step that decodes the latents to images"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        components = [
            ComponentSpec("vae", AutoencoderKLQwenImage),
            ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config"),
        ]

        return components

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(name="height", required=True),
            InputParam(name="width", required=True),
            InputParam(
                name="latents",
                required=True,
                type_hint=torch.Tensor,
                description="The latents to decode, can be generated in the denoise step",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                "images",
                # `np.ndarray` is the array type; `np.array` is only the factory function
                type_hint=Union[List[PIL.Image.Image], List[torch.Tensor], List[np.ndarray]],
                description="The generated images, can be a PIL.Image.Image, torch.Tensor or a numpy array",
            )
        ]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        block_state = self.get_block_state(state)

        # YiYi Notes: remove support for output_type = "latents", we can just skip decode/encode step in modular
        block_state.latents = components.pachifier.unpack_latents(
            block_state.latents, block_state.height, block_state.width
        )
        block_state.latents = block_state.latents.to(components.vae.dtype)

        latents_mean = (
            torch.tensor(components.vae.config.latents_mean)
            .view(1, components.vae.config.z_dim, 1, 1, 1)
            .to(block_state.latents.device, block_state.latents.dtype)
        )
        # NOTE: despite its name, `latents_std` holds the reciprocal (1 / std), so the division below
        # multiplies the latents by std before the mean is added back.
        latents_std = 1.0 / torch.tensor(components.vae.config.latents_std).view(
            1, components.vae.config.z_dim, 1, 1, 1
        ).to(block_state.latents.device, block_state.latents.dtype)
        block_state.latents = block_state.latents / latents_std + latents_mean
        # keep index 0 of dim 2 of the decoder output (presumably the single frame axis -- TODO confirm)
        block_state.images = components.vae.decode(block_state.latents, return_dict=False)[0][:, :, 0]

        self.set_block_state(state, block_state)
        return components, state
class QwenImageProcessImagesOutputStep(ModularPipelineBlocks):
    """Pipeline block that converts decoded images into the requested output format.

    Reads `images` and `output_type` from the pipeline state, validates `output_type`, and replaces `images` with the
    result of the `image_processor` component's `postprocess` call.
    """

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "postprocess the generated image"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        image_processor_spec = ComponentSpec(
            "image_processor",
            VaeImageProcessor,
            config=FrozenDict({"vae_scale_factor": 16}),
            default_creation_method="from_config",
        )
        return [image_processor_spec]

    @property
    def inputs(self) -> List[InputParam]:
        images_param = InputParam("images", required=True, description="the generated image from decoders step")
        output_type_param = InputParam(
            name="output_type",
            default="pil",
            type_hint=str,
            description="The type of the output images, can be 'pil', 'np', 'pt'",
        )
        return [images_param, output_type_param]

    @staticmethod
    def check_inputs(output_type):
        """Raise `ValueError` unless `output_type` is one of 'pil', 'np' or 'pt'."""
        if output_type in ("pil", "np", "pt"):
            return
        raise ValueError(f"Invalid output_type: {output_type}")

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
        """Validate `output_type` and post-process `images` in place on the pipeline state.

        Returns:
            The `(components, state)` pair.
        """
        block_state = self.get_block_state(state)
        self.check_inputs(block_state.output_type)

        processed = components.image_processor.postprocess(
            image=block_state.images,
            output_type=block_state.output_type,
        )
        block_state.images = processed

        self.set_block_state(state, block_state)
        return components, state
class QwenImageInpaintProcessImagesOutputStep(ModularPipelineBlocks):
    """Pipeline block that post-processes decoded images for inpainting.

    Reads `images`, `output_type` and the optional `mask_overlay_kwargs` from the pipeline state and replaces
    `images` with the result of the `image_mask_processor` component's `postprocess` call.
    """

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "postprocess the generated image, optional apply the mask overally to the original image.."

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec(
                "image_mask_processor",
                InpaintProcessor,
                config=FrozenDict({"vae_scale_factor": 16}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam("images", required=True, description="the generated image from decoders step"),
            InputParam(
                name="output_type",
                default="pil",
                type_hint=str,
                description="The type of the output images, can be 'pil', 'np', 'pt'",
            ),
            InputParam("mask_overlay_kwargs"),
        ]

    @staticmethod
    def check_inputs(output_type, mask_overlay_kwargs):
        """Validate the output format.

        Raises:
            ValueError: If `output_type` is not one of 'pil', 'np', 'pt', or if `mask_overlay_kwargs` is non-empty
                while `output_type` is not 'pil'.
        """
        if output_type not in ["pil", "np", "pt"]:
            raise ValueError(f"Invalid output_type: {output_type}")

        if mask_overlay_kwargs and output_type != "pil":
            raise ValueError("only support output_type 'pil' for mask overlay")

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
        """Validate the inputs and post-process `images` in place on the pipeline state.

        Returns:
            The `(components, state)` pair.
        """
        block_state = self.get_block_state(state)

        self.check_inputs(block_state.output_type, block_state.mask_overlay_kwargs)

        # Forward the validated `output_type`; previously it was checked but never passed on, so requests for
        # 'np' / 'pt' silently fell back to the processor's default. Entries in `mask_overlay_kwargs` are applied
        # last so an explicit value there still takes precedence and no duplicate keyword is passed.
        postprocess_kwargs = {"output_type": block_state.output_type}
        if block_state.mask_overlay_kwargs is not None:
            postprocess_kwargs.update(block_state.mask_overlay_kwargs)

        block_state.images = components.image_mask_processor.postprocess(
            image=block_state.images,
            **postprocess_kwargs,
        )

        self.set_block_state(state, block_state)
        return components, state
@@ -0,0 +1,668 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple
import torch
from ...configuration_utils import FrozenDict
from ...guiders import ClassifierFreeGuidance
from ...models import QwenImageControlNetModel, QwenImageTransformer2DModel
from ...schedulers import FlowMatchEulerDiscreteScheduler
from ...utils import logging
from ..modular_pipeline import BlockState, LoopSequentialPipelineBlocks, ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
from .modular_pipeline import QwenImageModularPipeline
logger = logging.get_logger(__name__)
class QwenImageLoopBeforeDenoiser(ModularPipelineBlocks):
    """Loop sub-block that sets `timestep` and `latent_model_input` on the block state for one iteration."""

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return (
            "step within the denoising loop that prepares the latent input for the denoiser. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `QwenImageDenoiseLoopWrapper`)"
        )

    @property
    def inputs(self) -> List[InputParam]:
        latents_param = InputParam(
            name="latents",
            required=True,
            type_hint=torch.Tensor,
            description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
        )
        return [latents_param]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        """Prepare the denoiser inputs for loop index `i` and timestep `t`.

        Returns:
            The `(components, block_state)` pair.
        """
        latents = block_state.latents

        # one timestep, expanded to the size of the first latents dimension and cast to the latents dtype
        block_state.timestep = t.expand(latents.shape[0]).to(latents.dtype)
        block_state.latent_model_input = latents
        return components, block_state
class QwenImageEditLoopBeforeDenoiser(ModularPipelineBlocks):
    """Loop sub-block that builds the denoiser input by concatenating `latents` with `image_latents`."""

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return (
            "step within the denoising loop that prepares the latent input for the denoiser. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `QwenImageDenoiseLoopWrapper`)"
        )

    @property
    def inputs(self) -> List[InputParam]:
        latents_param = InputParam(
            name="latents",
            required=True,
            type_hint=torch.Tensor,
            description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
        )
        image_latents_param = InputParam(
            name="image_latents",
            required=True,
            type_hint=torch.Tensor,
            description="The initial image latents to use for the denoising process. Can be encoded in vae_encoder step and packed in prepare_image_latents step.",
        )
        return [latents_param, image_latents_param]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        """Prepare the denoiser inputs for loop index `i` and timestep `t`.

        Returns:
            The `(components, block_state)` pair.
        """
        latents = block_state.latents

        # the model input is the latents followed by the image latents along dim 1
        block_state.latent_model_input = torch.cat((latents, block_state.image_latents), dim=1)

        # one timestep, expanded to the size of the first latents dimension and cast to the latents dtype
        block_state.timestep = t.expand(latents.shape[0]).to(latents.dtype)
        return components, block_state
class QwenImageLoopBeforeDenoiserControlNet(ModularPipelineBlocks):
    """Loop sub-block that runs the controlnet and stores its output for the denoiser.

    For each iteration it computes `cond_scale` from `controlnet_conditioning_scale` and `controlnet_keep[i]`, calls
    the `controlnet` component, and stores the result under
    `block_state.additional_cond_kwargs["controlnet_block_samples"]`.
    """

    model_name = "qwenimage"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec(
                "guider",
                ClassifierFreeGuidance,
                config=FrozenDict({"guidance_scale": 4.0}),
                default_creation_method="from_config",
            ),
            ComponentSpec("controlnet", QwenImageControlNetModel),
        ]

    @property
    def description(self) -> str:
        return (
            "step within the denoising loop that runs the controlnet before the denoiser. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `QwenImageDenoiseLoopWrapper`)"
        )

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "control_image_latents",
                required=True,
                type_hint=torch.Tensor,
                description="The control image to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
            ),
            InputParam(
                "controlnet_conditioning_scale",
                type_hint=float,
                description="The controlnet conditioning scale value to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
            ),
            InputParam(
                "controlnet_keep",
                required=True,
                type_hint=List[float],
                description="The controlnet keep values to use for the denoising process. Can be generated in prepare_controlnet_inputs step.",
            ),
            InputParam(
                "num_inference_steps",
                required=True,
                type_hint=int,
                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
            ),
            InputParam(
                kwargs_type="denoiser_input_fields",
                description=(
                    "All conditional model inputs for the denoiser. "
                    "It should contain prompt_embeds/negative_prompt_embeds, txt_seq_lens/negative_txt_seq_lens."
                ),
            ),
        ]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        """Run the controlnet for loop index `i`.

        Reads `latent_model_input`, `timestep`, `img_shapes`, `prompt_embeds`, `prompt_embeds_mask` and
        `txt_seq_lens` from `block_state` in addition to the declared controlnet inputs, and expects
        `block_state.additional_cond_kwargs` to already be a dict.

        Args:
            components: Pipeline object providing the `controlnet` component.
            block_state: Mutable per-loop state shared by the loop sub-blocks.
            i: Index of the current loop iteration, used to index `controlnet_keep`.
            t: Current timestep (not read here; the call uses `block_state.timestep`).

        Returns:
            The `(components, block_state)` pair.
        """
        # cond_scale for the timestep (controlnet input)
        if isinstance(block_state.controlnet_keep[i], list):
            # per-controlnet keep values: scale each conditioning scale by its own keep value
            block_state.cond_scale = [
                c * s for c, s in zip(block_state.controlnet_conditioning_scale, block_state.controlnet_keep[i])
            ]
        else:
            controlnet_cond_scale = block_state.controlnet_conditioning_scale
            if isinstance(controlnet_cond_scale, list):
                # a single keep value: only the first conditioning scale is used
                controlnet_cond_scale = controlnet_cond_scale[0]
            block_state.cond_scale = controlnet_cond_scale * block_state.controlnet_keep[i]

        # run controlnet for the guidance batch
        controlnet_block_samples = components.controlnet(
            hidden_states=block_state.latent_model_input,
            controlnet_cond=block_state.control_image_latents,
            conditioning_scale=block_state.cond_scale,
            timestep=block_state.timestep / 1000,
            img_shapes=block_state.img_shapes,
            encoder_hidden_states=block_state.prompt_embeds,
            encoder_hidden_states_mask=block_state.prompt_embeds_mask,
            txt_seq_lens=block_state.txt_seq_lens,
            return_dict=False,
        )

        # made available to the denoiser, which unpacks `additional_cond_kwargs` into the transformer call
        block_state.additional_cond_kwargs["controlnet_block_samples"] = controlnet_block_samples

        return components, block_state
class QwenImageLoopDenoiser(ModularPipelineBlocks):
    """Loop sub-block that runs the transformer for every guider batch and stores the rescaled prediction.

    The result is written to `block_state.noise_pred`.
    """

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return (
            "step within the denoising loop that denoise the latent input for the denoiser. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `QwenImageDenoiseLoopWrapper`)"
        )

    @property
    def expected_components(self) -> List[ComponentSpec]:
        guider_spec = ComponentSpec(
            "guider",
            ClassifierFreeGuidance,
            config=FrozenDict({"guidance_scale": 4.0}),
            default_creation_method="from_config",
        )
        transformer_spec = ComponentSpec("transformer", QwenImageTransformer2DModel)
        return [guider_spec, transformer_spec]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(name="attention_kwargs"),
            InputParam(
                name="latents",
                required=True,
                type_hint=torch.Tensor,
                description="The latents to use for the denoising process. Can be generated in prepare_latents step.",
            ),
            InputParam(
                name="num_inference_steps",
                required=True,
                type_hint=int,
                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
            ),
            InputParam(
                kwargs_type="denoiser_input_fields",
                description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
            ),
            InputParam(
                name="img_shapes",
                required=True,
                type_hint=List[Tuple[int, int]],
                description="The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.",
            ),
        ]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        """Predict the noise for loop index `i` / timestep `t` and store it as `block_state.noise_pred`.

        Returns:
            The `(components, block_state)` pair.
        """
        guider = components.guider
        transformer = components.transformer

        # transformer keyword -> (conditional field, unconditional field) names looked up on the block state
        guider_input_fields = {
            "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
            "encoder_hidden_states_mask": ("prompt_embeds_mask", "negative_prompt_embeds_mask"),
            "txt_seq_lens": ("txt_seq_lens", "negative_txt_seq_lens"),
        }

        guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
        guider_state = guider.prepare_inputs(block_state, guider_input_fields)

        for batch in guider_state:
            guider.prepare_models(transformer)

            # only the fields named in `guider_input_fields` are forwarded to the transformer
            cond_kwargs = {key: value for key, value in batch.as_dict().items() if key in guider_input_fields}

            # YiYi TODO: add cache context
            transformer_outputs = transformer(
                hidden_states=block_state.latent_model_input,
                timestep=block_state.timestep / 1000,
                img_shapes=block_state.img_shapes,
                attention_kwargs=block_state.attention_kwargs,
                return_dict=False,
                **cond_kwargs,
                **block_state.additional_cond_kwargs,
            )
            batch.noise_pred = transformer_outputs[0]

            guider.cleanup_models(transformer)

        guider_output = guider(guider_state)

        # apply guidance rescale: give the guided prediction the norm of the conditional one (last dimension)
        cond_norm = torch.norm(guider_output.pred_cond, dim=-1, keepdim=True)
        guided_norm = torch.norm(guider_output.pred, dim=-1, keepdim=True)
        block_state.noise_pred = guider_output.pred * (cond_norm / guided_norm)

        return components, block_state
class QwenImageEditLoopDenoiser(ModularPipelineBlocks):
    """Loop sub-block that runs the transformer for every guider batch on the edit model input.

    The predictions are truncated along dim 1 to the size of `latents` before the guidance rescale, and the result
    is written to `block_state.noise_pred`.
    """

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return (
            "step within the denoising loop that denoise the latent input for the denoiser. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `QwenImageDenoiseLoopWrapper`)"
        )

    @property
    def expected_components(self) -> List[ComponentSpec]:
        guider_spec = ComponentSpec(
            "guider",
            ClassifierFreeGuidance,
            config=FrozenDict({"guidance_scale": 4.0}),
            default_creation_method="from_config",
        )
        transformer_spec = ComponentSpec("transformer", QwenImageTransformer2DModel)
        return [guider_spec, transformer_spec]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(name="attention_kwargs"),
            InputParam(
                name="latents",
                required=True,
                type_hint=torch.Tensor,
                description="The latents to use for the denoising process. Can be generated in prepare_latents step.",
            ),
            InputParam(
                name="num_inference_steps",
                required=True,
                type_hint=int,
                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
            ),
            InputParam(
                kwargs_type="denoiser_input_fields",
                description="conditional model inputs for the denoiser: e.g. prompt_embeds, negative_prompt_embeds, etc.",
            ),
            InputParam(
                name="img_shapes",
                required=True,
                type_hint=List[Tuple[int, int]],
                description="The shape of the image latents for RoPE calculation. Can be generated in prepare_additional_inputs step.",
            ),
        ]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        """Predict the noise for loop index `i` / timestep `t` and store it as `block_state.noise_pred`.

        Returns:
            The `(components, block_state)` pair.
        """
        guider = components.guider
        transformer = components.transformer

        # transformer keyword -> (conditional field, unconditional field) names looked up on the block state
        guider_input_fields = {
            "encoder_hidden_states": ("prompt_embeds", "negative_prompt_embeds"),
            "encoder_hidden_states_mask": ("prompt_embeds_mask", "negative_prompt_embeds_mask"),
            "txt_seq_lens": ("txt_seq_lens", "negative_txt_seq_lens"),
        }

        guider.set_state(step=i, num_inference_steps=block_state.num_inference_steps, timestep=t)
        guider_state = guider.prepare_inputs(block_state, guider_input_fields)

        for batch in guider_state:
            guider.prepare_models(transformer)

            # only the fields named in `guider_input_fields` are forwarded to the transformer
            cond_kwargs = {key: value for key, value in batch.as_dict().items() if key in guider_input_fields}

            # YiYi TODO: add cache context
            transformer_outputs = transformer(
                hidden_states=block_state.latent_model_input,
                timestep=block_state.timestep / 1000,
                img_shapes=block_state.img_shapes,
                attention_kwargs=block_state.attention_kwargs,
                return_dict=False,
                **cond_kwargs,
                **block_state.additional_cond_kwargs,
            )
            batch.noise_pred = transformer_outputs[0]

            guider.cleanup_models(transformer)

        guider_output = guider(guider_state)

        # keep only the leading entries along dim 1 that correspond to `latents`
        latents_len = block_state.latents.size(1)
        pred = guider_output.pred[:, :latents_len]
        pred_cond = guider_output.pred_cond[:, :latents_len]

        # apply guidance rescale: give the guided prediction the norm of the conditional one (last dimension)
        cond_norm = torch.norm(pred_cond, dim=-1, keepdim=True)
        guided_norm = torch.norm(pred, dim=-1, keepdim=True)
        block_state.noise_pred = pred * (cond_norm / guided_norm)

        return components, block_state
class QwenImageLoopAfterDenoiser(ModularPipelineBlocks):
    """Loop sub-block that advances `latents` by one scheduler step using `block_state.noise_pred`."""

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return (
            "step within the denoising loop that updates the latents. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `QwenImageDenoiseLoopWrapper`)"
        )

    @property
    def expected_components(self) -> List[ComponentSpec]:
        scheduler_spec = ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)
        return [scheduler_spec]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        latents_output = OutputParam("latents", type_hint=torch.Tensor, description="The denoised latents.")
        return [latents_output]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        """Run one scheduler step for timestep `t` and store the new latents on `block_state`.

        Returns:
            The `(components, block_state)` pair.
        """
        dtype_before_step = block_state.latents.dtype

        step_outputs = components.scheduler.step(
            block_state.noise_pred,
            t,
            block_state.latents,
            return_dict=False,
        )
        block_state.latents = step_outputs[0]

        # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
        # so the original dtype is restored only when it changed and the mps backend is available
        if block_state.latents.dtype != dtype_before_step and torch.backends.mps.is_available():
            block_state.latents = block_state.latents.to(dtype_before_step)

        return components, block_state
class QwenImageLoopAfterDenoiserInpaint(ModularPipelineBlocks):
    """Loop sub-block that blends the updated latents with the (re-noised) image latents using `mask`."""

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return (
            "step within the denoising loop that updates the latents using mask and image_latents for inpainting. "
            "This block should be used to compose the `sub_blocks` attribute of a `LoopSequentialPipelineBlocks` "
            "object (e.g. `QwenImageDenoiseLoopWrapper`)"
        )

    @property
    def inputs(self) -> List[InputParam]:
        mask_param = InputParam(
            name="mask",
            required=True,
            type_hint=torch.Tensor,
            description="The mask to use for the inpainting process. Can be generated in inpaint prepare latents step.",
        )
        image_latents_param = InputParam(
            name="image_latents",
            required=True,
            type_hint=torch.Tensor,
            description="The image latents to use for the inpainting process. Can be generated in inpaint prepare latents step.",
        )
        initial_noise_param = InputParam(
            name="initial_noise",
            required=True,
            type_hint=torch.Tensor,
            description="The initial noise to use for the inpainting process. Can be generated in inpaint prepare latents step.",
        )
        timesteps_param = InputParam(
            name="timesteps",
            required=True,
            type_hint=torch.Tensor,
            description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
        )
        return [mask_param, image_latents_param, initial_noise_param, timesteps_param]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        """Blend `latents` with the image latents for loop index `i`.

        Returns:
            The `(components, block_state)` pair.
        """
        is_last_step = i >= len(block_state.timesteps) - 1

        block_state.init_latents_proper = block_state.image_latents
        if not is_last_step:
            # noise the image latents to the level of the next timestep; on the last step they are used as-is
            block_state.noise_timestep = block_state.timesteps[i + 1]
            block_state.init_latents_proper = components.scheduler.scale_noise(
                block_state.init_latents_proper, torch.tensor([block_state.noise_timestep]), block_state.initial_noise
            )

        # image latents where the mask is 0, freshly denoised latents where it is 1
        blend_mask = block_state.mask
        block_state.latents = (1 - blend_mask) * block_state.init_latents_proper + blend_mask * block_state.latents

        return components, block_state
class QwenImageDenoiseLoopWrapper(LoopSequentialPipelineBlocks):
    """Loop block that calls `loop_step` once per entry of `timesteps`.

    The work done in each iteration is delegated to `self.loop_step`; this class only owns the iteration over
    `timesteps`, the progress bar, and the `additional_cond_kwargs` dict made available on the block state.
    """

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return (
            "Pipeline block that iteratively denoise the latents over `timesteps`. "
            "The specific steps with each iteration can be customized with `sub_blocks` attributes"
        )

    @property
    def loop_expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler),
        ]

    @property
    def loop_inputs(self) -> List[InputParam]:
        return [
            InputParam(
                "timesteps",
                required=True,
                type_hint=torch.Tensor,
                description="The timesteps to use for the denoising process. Can be generated in set_timesteps step.",
            ),
            InputParam(
                "num_inference_steps",
                required=True,
                type_hint=int,
                description="The number of inference steps to use for the denoising process. Can be generated in set_timesteps step.",
            ),
        ]

    @torch.no_grad()
    def __call__(
        self, components: QwenImageModularPipeline, state: PipelineState
    ) -> Tuple[QwenImageModularPipeline, PipelineState]:
        """Run the denoising loop over `block_state.timesteps`.

        Args:
            components: Pipeline object providing the `scheduler` component (its `order` is read here).
            state: Pipeline state holding `timesteps` and `num_inference_steps`.

        Returns:
            The `(components, state)` pair after the final iteration.
        """
        block_state = self.get_block_state(state)

        block_state.num_warmup_steps = max(
            len(block_state.timesteps) - block_state.num_inference_steps * components.scheduler.order, 0
        )

        # starts empty for every run; sub-blocks may add extra keyword arguments for the denoiser here
        block_state.additional_cond_kwargs = {}

        with self.progress_bar(total=block_state.num_inference_steps) as progress_bar:
            for i, t in enumerate(block_state.timesteps):
                components, block_state = self.loop_step(components, block_state, i=i, t=t)

                # tick on the last timestep, or after warmup whenever a multiple of `scheduler.order` is reached
                if i == len(block_state.timesteps) - 1 or (
                    (i + 1) > block_state.num_warmup_steps and (i + 1) % components.scheduler.order == 0
                ):
                    progress_bar.update()

        self.set_block_state(state, block_state)

        return components, state
# composing the denoising loops
class QwenImageDenoiseStep(QwenImageDenoiseLoopWrapper):
    """Denoise loop composed of the before-denoiser, denoiser and after-denoiser sub-blocks."""

    block_classes = [
        QwenImageLoopBeforeDenoiser,
        QwenImageLoopDenoiser,
        QwenImageLoopAfterDenoiser,
    ]
    block_names = ["before_denoiser", "denoiser", "after_denoiser"]

    @property
    def description(self) -> str:
        # Lines are joined with "\n"; trailing spaces inside the literals are intentional.
        lines = [
            "Denoise step that iteratively denoise the latents. ",
            "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method ",
            "At each iteration, it runs blocks defined in `sub_blocks` sequencially:",
            " - `QwenImageLoopBeforeDenoiser`",
            " - `QwenImageLoopDenoiser`",
            " - `QwenImageLoopAfterDenoiser`",
            "This block supports text2image and image2image tasks for QwenImage.",
        ]
        return "\n".join(lines)
# composing the inpainting denoising loops
class QwenImageInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
    """Denoise loop for inpainting: the base sub-blocks followed by the inpaint blending sub-block."""

    block_classes = [
        QwenImageLoopBeforeDenoiser,
        QwenImageLoopDenoiser,
        QwenImageLoopAfterDenoiser,
        QwenImageLoopAfterDenoiserInpaint,
    ]
    block_names = ["before_denoiser", "denoiser", "after_denoiser", "after_denoiser_inpaint"]

    @property
    def description(self) -> str:
        # Lines are joined with "\n"; trailing spaces inside the literals are intentional.
        lines = [
            "Denoise step that iteratively denoise the latents. ",
            "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method ",
            "At each iteration, it runs blocks defined in `sub_blocks` sequencially:",
            " - `QwenImageLoopBeforeDenoiser`",
            " - `QwenImageLoopDenoiser`",
            " - `QwenImageLoopAfterDenoiser`",
            " - `QwenImageLoopAfterDenoiserInpaint`",
            "This block supports inpainting tasks for QwenImage.",
        ]
        return "\n".join(lines)
# composing the controlnet denoising loops
class QwenImageControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
    """Denoise loop with a controlnet sub-block inserted between the before-denoiser and denoiser sub-blocks."""

    block_classes = [
        QwenImageLoopBeforeDenoiser,
        QwenImageLoopBeforeDenoiserControlNet,
        QwenImageLoopDenoiser,
        QwenImageLoopAfterDenoiser,
    ]
    block_names = ["before_denoiser", "before_denoiser_controlnet", "denoiser", "after_denoiser"]

    @property
    def description(self) -> str:
        # Lines are joined with "\n"; trailing spaces inside the literals are intentional.
        lines = [
            "Denoise step that iteratively denoise the latents. ",
            "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method ",
            "At each iteration, it runs blocks defined in `sub_blocks` sequencially:",
            " - `QwenImageLoopBeforeDenoiser`",
            " - `QwenImageLoopBeforeDenoiserControlNet`",
            " - `QwenImageLoopDenoiser`",
            " - `QwenImageLoopAfterDenoiser`",
            "This block supports text2img/img2img tasks with controlnet for QwenImage.",
        ]
        return "\n".join(lines)
# composing the inpainting + controlnet denoising loops
class QwenImageInpaintControlNetDenoiseStep(QwenImageDenoiseLoopWrapper):
    """Denoise loop combining the controlnet sub-block with the inpaint blending sub-block."""

    block_classes = [
        QwenImageLoopBeforeDenoiser,
        QwenImageLoopBeforeDenoiserControlNet,
        QwenImageLoopDenoiser,
        QwenImageLoopAfterDenoiser,
        QwenImageLoopAfterDenoiserInpaint,
    ]
    block_names = [
        "before_denoiser",
        "before_denoiser_controlnet",
        "denoiser",
        "after_denoiser",
        "after_denoiser_inpaint",
    ]

    @property
    def description(self) -> str:
        # Lines are joined with "\n"; trailing spaces inside the literals are intentional.
        lines = [
            "Denoise step that iteratively denoise the latents. ",
            "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method ",
            "At each iteration, it runs blocks defined in `sub_blocks` sequencially:",
            " - `QwenImageLoopBeforeDenoiser`",
            " - `QwenImageLoopBeforeDenoiserControlNet`",
            " - `QwenImageLoopDenoiser`",
            " - `QwenImageLoopAfterDenoiser`",
            " - `QwenImageLoopAfterDenoiserInpaint`",
            "This block supports inpainting tasks with controlnet for QwenImage.",
        ]
        return "\n".join(lines)
# composing the edit denoising loops
class QwenImageEditDenoiseStep(QwenImageDenoiseLoopWrapper):
    """Denoise loop for QwenImage Edit, using the edit-specific before-denoiser and denoiser sub-blocks."""

    block_classes = [
        QwenImageEditLoopBeforeDenoiser,
        QwenImageEditLoopDenoiser,
        QwenImageLoopAfterDenoiser,
    ]
    block_names = ["before_denoiser", "denoiser", "after_denoiser"]

    @property
    def description(self) -> str:
        # Lines are joined with "\n"; trailing spaces inside the literals are intentional.
        lines = [
            "Denoise step that iteratively denoise the latents. ",
            "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method ",
            "At each iteration, it runs blocks defined in `sub_blocks` sequencially:",
            " - `QwenImageEditLoopBeforeDenoiser`",
            " - `QwenImageEditLoopDenoiser`",
            " - `QwenImageLoopAfterDenoiser`",
            "This block supports QwenImage Edit.",
        ]
        return "\n".join(lines)
class QwenImageEditInpaintDenoiseStep(QwenImageDenoiseLoopWrapper):
    """Denoise loop for QwenImage Edit inpainting: the edit sub-blocks followed by the inpaint blending sub-block."""

    block_classes = [
        QwenImageEditLoopBeforeDenoiser,
        QwenImageEditLoopDenoiser,
        QwenImageLoopAfterDenoiser,
        QwenImageLoopAfterDenoiserInpaint,
    ]
    block_names = ["before_denoiser", "denoiser", "after_denoiser", "after_denoiser_inpaint"]

    @property
    def description(self) -> str:
        # Lines are joined with "\n"; trailing spaces inside the literals are intentional.
        lines = [
            "Denoise step that iteratively denoise the latents. ",
            "Its loop logic is defined in `QwenImageDenoiseLoopWrapper.__call__` method ",
            "At each iteration, it runs blocks defined in `sub_blocks` sequencially:",
            " - `QwenImageEditLoopBeforeDenoiser`",
            " - `QwenImageEditLoopDenoiser`",
            " - `QwenImageLoopAfterDenoiser`",
            " - `QwenImageLoopAfterDenoiserInpaint`",
            "This block supports inpainting tasks for QwenImage Edit.",
        ]
        return "\n".join(lines)
@@ -0,0 +1,857 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Dict, List, Optional, Union
import PIL
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor
from ...configuration_utils import FrozenDict
from ...guiders import ClassifierFreeGuidance
from ...image_processor import InpaintProcessor, VaeImageProcessor, is_valid_image, is_valid_image_imagelist
from ...models import AutoencoderKLQwenImage, QwenImageControlNetModel, QwenImageMultiControlNetModel
from ...pipelines.qwenimage.pipeline_qwenimage_edit import calculate_dimensions
from ...utils import logging
from ...utils.torch_utils import unwrap_module
from ..modular_pipeline import ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import ComponentSpec, ConfigSpec, InputParam, OutputParam
from .modular_pipeline import QwenImageModularPipeline
logger = logging.get_logger(__name__)
def _extract_masked_hidden(hidden_states: torch.Tensor, mask: torch.Tensor):
    """Split `hidden_states` into one tensor per row of `mask`, keeping only the positions where `mask` is truthy.

    Args:
        hidden_states: Tensor indexed by the boolean form of `mask` over its leading dimensions.
        mask: Tensor that is converted to bool; its per-row sums (along dim 1) give the chunk lengths.

    Returns:
        The tuple produced by `torch.split`, with one chunk per row of `mask`.
    """
    keep = mask.bool()
    # number of kept positions in each row, as plain Python ints for `torch.split`
    rows_lengths = keep.sum(dim=1).tolist()
    # boolean indexing flattens the kept positions row by row, so consecutive chunks line up with the rows
    return torch.split(hidden_states[keep], rows_lengths, dim=0)
def get_qwen_prompt_embeds(
    text_encoder,
    tokenizer,
    prompt: Union[str, List[str]] = None,
    prompt_template_encode: str = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
    prompt_template_encode_start_idx: int = 34,
    tokenizer_max_length: int = 1024,
    device: Optional[torch.device] = None,
):
    """Encode text prompts into padded prompt embeddings and the matching attention mask.

    Each prompt is inserted into `prompt_template_encode`, tokenized, and passed through `text_encoder`. The last
    hidden state is split per prompt using the tokenizer attention mask, the first
    `prompt_template_encode_start_idx` positions of every sequence are dropped, and the sequences are zero-padded to
    a common length.

    Args:
        text_encoder: Callable accepting `input_ids`, `attention_mask` and `output_hidden_states`, whose result
            exposes `hidden_states`.
        tokenizer: Callable tokenizer whose result supports `.to(device)` and exposes `input_ids` and
            `attention_mask`.
        prompt: A prompt or list of prompts.
        prompt_template_encode: Format string with one `{}` placeholder for the prompt.
        prompt_template_encode_start_idx: Number of leading positions to drop from every sequence.
        tokenizer_max_length: Maximum length passed to the tokenizer, on top of the dropped prefix length.
        device: Device the tokenizer output and the returned embeddings are moved to.

    Returns:
        A `(prompt_embeds, encoder_attention_mask)` tuple of stacked, padded tensors.
    """
    if isinstance(prompt, str):
        prompt = [prompt]

    prefix_len = prompt_template_encode_start_idx
    formatted_prompts = [prompt_template_encode.format(p) for p in prompt]

    tokens = tokenizer(
        formatted_prompts,
        max_length=tokenizer_max_length + prefix_len,
        padding=True,
        truncation=True,
        return_tensors="pt",
    ).to(device)

    encoder_outputs = text_encoder(
        input_ids=tokens.input_ids,
        attention_mask=tokens.attention_mask,
        output_hidden_states=True,
    )
    last_hidden = encoder_outputs.hidden_states[-1]

    # one variable-length sequence per prompt, with the template prefix removed
    sequences = [seq[prefix_len:] for seq in _extract_masked_hidden(last_hidden, tokens.attention_mask)]
    seq_lengths = [seq.size(0) for seq in sequences]
    longest = max(seq_lengths)

    padded_embeds = []
    padded_masks = []
    for seq, length in zip(sequences, seq_lengths):
        pad = longest - length
        padded_embeds.append(torch.cat([seq, seq.new_zeros(pad, seq.size(1))]))
        valid = torch.ones(length, dtype=torch.long, device=seq.device)
        padded_masks.append(torch.cat([valid, valid.new_zeros(pad)]))

    prompt_embeds = torch.stack(padded_embeds).to(device=device)
    encoder_attention_mask = torch.stack(padded_masks)

    return prompt_embeds, encoder_attention_mask
def get_qwen_prompt_embeds_edit(
    text_encoder,
    processor,
    prompt: Union[str, List[str]] = None,
    image: Optional[torch.Tensor] = None,
    prompt_template_encode: str = "<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n",
    prompt_template_encode_start_idx: int = 64,
    device: Optional[torch.device] = None,
):
    """Encode text prompts together with an image into padded prompt embeddings and the matching attention mask.

    Each prompt is inserted into `prompt_template_encode`; the texts and `image` are passed through `processor` and
    then `text_encoder`. The last hidden state is split per prompt using the processor attention mask, the first
    `prompt_template_encode_start_idx` positions of every sequence are dropped, and the sequences are zero-padded to
    a common length.

    Args:
        text_encoder: Callable accepting `input_ids`, `attention_mask`, `pixel_values`, `image_grid_thw` and
            `output_hidden_states`, whose result exposes `hidden_states`.
        processor: Callable accepting `text` and `images`, whose result supports `.to(device)` and exposes the
            fields forwarded to `text_encoder`.
        prompt: A prompt or list of prompts.
        image: Image input forwarded to `processor` as `images`.
        prompt_template_encode: Format string with one `{}` placeholder for the prompt.
        prompt_template_encode_start_idx: Number of leading positions to drop from every sequence.
        device: Device the processor output and the returned embeddings are moved to.

    Returns:
        A `(prompt_embeds, encoder_attention_mask)` tuple of stacked, padded tensors.
    """
    if isinstance(prompt, str):
        prompt = [prompt]

    prefix_len = prompt_template_encode_start_idx
    formatted_prompts = [prompt_template_encode.format(p) for p in prompt]

    model_inputs = processor(
        text=formatted_prompts,
        images=image,
        padding=True,
        return_tensors="pt",
    ).to(device)

    encoder_outputs = text_encoder(
        input_ids=model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        pixel_values=model_inputs.pixel_values,
        image_grid_thw=model_inputs.image_grid_thw,
        output_hidden_states=True,
    )
    last_hidden = encoder_outputs.hidden_states[-1]

    # one variable-length sequence per prompt, with the template prefix removed
    sequences = [seq[prefix_len:] for seq in _extract_masked_hidden(last_hidden, model_inputs.attention_mask)]
    seq_lengths = [seq.size(0) for seq in sequences]
    longest = max(seq_lengths)

    padded_embeds = []
    padded_masks = []
    for seq, length in zip(sequences, seq_lengths):
        pad = longest - length
        padded_embeds.append(torch.cat([seq, seq.new_zeros(pad, seq.size(1))]))
        valid = torch.ones(length, dtype=torch.long, device=seq.device)
        padded_masks.append(torch.cat([valid, valid.new_zeros(pad)]))

    prompt_embeds = torch.stack(padded_embeds).to(device=device)
    encoder_attention_mask = torch.stack(padded_masks)

    return prompt_embeds, encoder_attention_mask
# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
def retrieve_latents(
    encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
):
    """Extract latents from an encoder output object.

    Args:
        encoder_output: Object exposing a `latent_dist` and/or a `latents` attribute.
            NOTE(review): annotated as `torch.Tensor`, but the body only reads these attributes — presumably a VAE
            encoder output object; confirm.
        generator: Forwarded to `latent_dist.sample` when `sample_mode` is "sample".
        sample_mode: "sample" returns `latent_dist.sample(generator)`; "argmax" returns `latent_dist.mode()`. These
            modes only apply when `encoder_output` has a `latent_dist` attribute.

    Returns:
        The sampled or mode latents, or `encoder_output.latents` when neither `latent_dist` branch applies.

    Raises:
        AttributeError: If none of the branches above applies.
    """
    if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
        return encoder_output.latent_dist.sample(generator)
    elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
        return encoder_output.latent_dist.mode()
    elif hasattr(encoder_output, "latents"):
        return encoder_output.latents
    else:
        raise AttributeError("Could not access latents of provided encoder_output")
# Modified from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._encode_vae_image
def encode_vae_image(
    image: torch.Tensor,
    vae: AutoencoderKLQwenImage,
    generator: torch.Generator,
    device: torch.device,
    dtype: torch.dtype,
    latent_channels: int = 16,
    sample_mode: str = "argmax",
):
    """Encode an image tensor with the VAE and normalize the resulting latents.

    Args:
        image: 4D or 5D image tensor; a 4D tensor gets a singleton dimension inserted at index 2.
        vae: VAE used for encoding; `vae.config.latents_mean` and `vae.config.latents_std` are used to normalize.
        generator: Generator forwarded to `retrieve_latents`. When a list is given, each item along the first
            image dimension is encoded separately with the generator at the same index.
        device: Device the image is moved to before encoding.
        dtype: Dtype the image is cast to before encoding.
        latent_channels: Size of the channel dimension used to reshape the mean / std values.
        sample_mode: Forwarded to `retrieve_latents`.

    Returns:
        The latents computed as `(latents - latents_mean) / latents_std`.

    Raises:
        ValueError: If `image` is not a tensor or is neither 4D nor 5D.
    """
    if not isinstance(image, torch.Tensor):
        raise ValueError(f"Expected image to be a tensor, got {type(image)}.")

    # preprocessed image should be a 4D tensor: batch_size, num_channels, height, width
    num_dims = image.dim()
    if num_dims == 4:
        image = image.unsqueeze(2)
    elif num_dims != 5:
        raise ValueError(f"Expected image dims 4 or 5, got {image.dim()}.")

    image = image.to(device=device, dtype=dtype)

    if isinstance(generator, list):
        # one generator per item: encode each slice on its own and concatenate the results
        per_item_latents = []
        for idx in range(image.shape[0]):
            encoded = vae.encode(image[idx : idx + 1])
            per_item_latents.append(retrieve_latents(encoded, generator=generator[idx], sample_mode=sample_mode))
        image_latents = torch.cat(per_item_latents, dim=0)
    else:
        image_latents = retrieve_latents(vae.encode(image), generator=generator, sample_mode=sample_mode)

    stats_shape = (1, latent_channels, 1, 1, 1)
    latents_mean = torch.tensor(vae.config.latents_mean).view(stats_shape)
    latents_mean = latents_mean.to(image_latents.device, image_latents.dtype)
    latents_std = torch.tensor(vae.config.latents_std).view(stats_shape)
    latents_std = latents_std.to(image_latents.device, image_latents.dtype)

    return (image_latents - latents_mean) / latents_std
class QwenImageEditResizeDynamicStep(ModularPipelineBlocks):
    """Pipeline block that resizes images to an area of 1024 * 1024 while keeping the aspect ratio.

    The names of the state field that is read and of the field that is written are configurable.
    """

    model_name = "qwenimage"

    def __init__(self, input_name: str = "image", output_name: str = "resized_image"):
        """Create a configurable step for resizing images to the target area (1024 * 1024) while maintaining the
        aspect ratio.

        This block resizes an input image and exposes the resized result under configurable input and output names.
        Use this when you need to wire the resize step to different image fields (e.g., "image", "control_image").

        Args:
            input_name (str, optional): Name of the image field to read from the pipeline state.
                Defaults to "image".
            output_name (str, optional): Name of the resized image field to write back to the pipeline state.
                Defaults to "resized_image".

        Raises:
            ValueError: If `input_name` or `output_name` is not a string.
        """
        if not isinstance(input_name, str) or not isinstance(output_name, str):
            raise ValueError(
                f"input_name and output_name must be strings but are {type(input_name)} and {type(output_name)}"
            )
        # Names must be stored before `super().__init__()`, matching the original initialization order.
        self._image_input_name = input_name
        self._resized_image_output_name = output_name
        super().__init__()

    @property
    def description(self) -> str:
        return f"Image Resize step that resize the {self._image_input_name} to the target area (1024 * 1024) while maintaining the aspect ratio."

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec(
                "image_resize_processor",
                VaeImageProcessor,
                config=FrozenDict({"vae_scale_factor": 16}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            # `__call__` unpacks `images[0].size` as (width, height), which is the PIL convention; a torch.Tensor
            # would fail there, so the input is declared as a PIL image (a list of PIL images is accepted as well).
            InputParam(
                name=self._image_input_name,
                required=True,
                type_hint=PIL.Image.Image,
                description="The image to resize",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                name=self._resized_image_output_name, type_hint=List[PIL.Image.Image], description="The resized images"
            ),
        ]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
        """Resize the configured input image(s) and store the list of resized images in the state.

        Raises:
            ValueError: If the input is neither a valid image nor a valid list of images.
        """
        block_state = self.get_block_state(state)

        images = getattr(block_state, self._image_input_name)

        if not is_valid_image_imagelist(images):
            raise ValueError(f"Images must be image or list of images but are {type(images)}")

        # Normalize a single image to a one-element list so the code below handles both cases uniformly.
        if is_valid_image(images):
            images = [images]

        # The target size is derived from the first image only and applied to every image in the list.
        image_width, image_height = images[0].size
        calculated_width, calculated_height, _ = calculate_dimensions(1024 * 1024, image_width / image_height)

        resized_images = [
            components.image_resize_processor.resize(image, height=calculated_height, width=calculated_width)
            for image in images
        ]

        setattr(block_state, self._resized_image_output_name, resized_images)
        self.set_block_state(state, block_state)
        return components, state
class QwenImageTextEncoderStep(ModularPipelineBlocks):
    """Pipeline block that encodes the prompt (and, when required, the negative prompt) into text embeddings."""

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "Text Encoder step that generate text_embeddings to guide the image generation"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [
            ComponentSpec("text_encoder", Qwen2_5_VLForConditionalGeneration, description="The text encoder to use"),
            ComponentSpec("tokenizer", Qwen2Tokenizer, description="The tokenizer to use"),
            ComponentSpec(
                "guider",
                ClassifierFreeGuidance,
                config=FrozenDict({"guidance_scale": 4.0}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def expected_configs(self) -> List[ConfigSpec]:
        return [
            ConfigSpec(
                name="prompt_template_encode",
                default="<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n",
            ),
            ConfigSpec(name="prompt_template_encode_start_idx", default=34),
            ConfigSpec(name="tokenizer_max_length", default=1024),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"),
            InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"),
            InputParam(
                name="max_sequence_length", type_hint=int, description="The max sequence length to use", default=1024
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                name="prompt_embeds",
                kwargs_type="denoiser_input_fields",
                type_hint=torch.Tensor,
                description="The prompt embeddings",
            ),
            OutputParam(
                name="prompt_embeds_mask",
                kwargs_type="denoiser_input_fields",
                type_hint=torch.Tensor,
                description="The encoder attention mask",
            ),
            OutputParam(
                name="negative_prompt_embeds",
                kwargs_type="denoiser_input_fields",
                type_hint=torch.Tensor,
                description="The negative prompt embeddings",
            ),
            OutputParam(
                name="negative_prompt_embeds_mask",
                kwargs_type="denoiser_input_fields",
                type_hint=torch.Tensor,
                description="The negative prompt embeddings mask",
            ),
        ]

    @staticmethod
    def check_inputs(prompt, negative_prompt, max_sequence_length):
        """Validate the types of the prompts and the upper bound of `max_sequence_length`.

        Raises:
            ValueError: If `prompt` is not a `str`/`list`, if `negative_prompt` is given but is not a `str`/`list`,
                or if `max_sequence_length` exceeds 1024.
        """
        if not isinstance(prompt, str) and not isinstance(prompt, list):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if (
            negative_prompt is not None
            and not isinstance(negative_prompt, str)
            and not isinstance(negative_prompt, list)
        ):
            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")

        if max_sequence_length is not None and max_sequence_length > 1024:
            raise ValueError(f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}")

    @staticmethod
    def _encode_prompt(components, prompt, device, max_sequence_length):
        """Encode `prompt` and truncate embeddings and mask to `max_sequence_length` along dim 1."""
        embeds, embeds_mask = get_qwen_prompt_embeds(
            components.text_encoder,
            components.tokenizer,
            prompt=prompt,
            prompt_template_encode=components.config.prompt_template_encode,
            prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
            tokenizer_max_length=components.config.tokenizer_max_length,
            device=device,
        )
        return embeds[:, :max_sequence_length], embeds_mask[:, :max_sequence_length]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
        """Encode the prompt and, if unconditional embeddings are required, the negative prompt.

        Writes `prompt_embeds` / `prompt_embeds_mask` and, when `components.requires_unconditional_embeds` is
        truthy, `negative_prompt_embeds` / `negative_prompt_embeds_mask` to the block state.
        """
        block_state = self.get_block_state(state)
        device = components._execution_device
        self.check_inputs(block_state.prompt, block_state.negative_prompt, block_state.max_sequence_length)

        block_state.prompt_embeds, block_state.prompt_embeds_mask = self._encode_prompt(
            components, block_state.prompt, device, block_state.max_sequence_length
        )

        if components.requires_unconditional_embeds:
            # A missing (or empty) negative prompt falls back to the empty string.
            negative_prompt = block_state.negative_prompt or ""
            # A single negative prompt applies to every prompt of a batch. Without this broadcast the negative
            # embeddings would have batch size 1, and the text input step rejects negative embeddings whose batch
            # size differs from that of `prompt_embeds`.
            if isinstance(negative_prompt, str) and isinstance(block_state.prompt, list):
                negative_prompt = [negative_prompt] * len(block_state.prompt)

            block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = self._encode_prompt(
                components, negative_prompt, device, block_state.max_sequence_length
            )

        self.set_block_state(state, block_state)
        return components, state
class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
    """Pipeline block that encodes prompt and image jointly into text embeddings for image editing."""

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "Text Encoder step that processes both prompt and image together to generate text embeddings for guiding image generation"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        guider_spec = ComponentSpec(
            "guider",
            ClassifierFreeGuidance,
            config=FrozenDict({"guidance_scale": 4.0}),
            default_creation_method="from_config",
        )
        return [
            ComponentSpec("text_encoder", Qwen2_5_VLForConditionalGeneration),
            ComponentSpec("processor", Qwen2VLProcessor),
            guider_spec,
        ]

    @property
    def expected_configs(self) -> List[ConfigSpec]:
        template_spec = ConfigSpec(
            name="prompt_template_encode",
            default="<|im_start|>system\nDescribe the key features of the input image (color, shape, size, texture, objects, background), then explain how the user's text instruction should alter or modify the image. Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>{}<|im_end|>\n<|im_start|>assistant\n",
        )
        start_idx_spec = ConfigSpec(name="prompt_template_encode_start_idx", default=64)
        return [template_spec, start_idx_spec]

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(name="prompt", required=True, type_hint=str, description="The prompt to encode"),
            InputParam(name="negative_prompt", type_hint=str, description="The negative prompt to encode"),
            InputParam(
                name="resized_image",
                required=True,
                type_hint=torch.Tensor,
                description="The image prompt to encode, should be resized using resize step",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        # (field name, description) of every tensor handed to the denoiser.
        output_fields = (
            ("prompt_embeds", "The prompt embeddings"),
            ("prompt_embeds_mask", "The encoder attention mask"),
            ("negative_prompt_embeds", "The negative prompt embeddings"),
            ("negative_prompt_embeds_mask", "The negative prompt embeddings mask"),
        )
        return [
            OutputParam(
                name=field_name,
                kwargs_type="denoiser_input_fields",
                type_hint=torch.Tensor,
                description=field_description,
            )
            for field_name, field_description in output_fields
        ]

    @staticmethod
    def check_inputs(prompt, negative_prompt):
        """Raise `ValueError` unless the prompts are `str`/`list` (`negative_prompt` may also be `None`)."""
        if not isinstance(prompt, (str, list)):
            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

        if negative_prompt is not None and not isinstance(negative_prompt, (str, list)):
            raise ValueError(f"`negative_prompt` has to be of type `str` or `list` but is {type(negative_prompt)}")

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
        """Encode prompt + resized image, and the negative prompt when unconditional embeddings are required."""
        block_state = self.get_block_state(state)
        self.check_inputs(block_state.prompt, block_state.negative_prompt)
        device = components._execution_device

        def _embed(text):
            # Every encoding shares the image, template settings and device; only the text differs.
            return get_qwen_prompt_embeds_edit(
                components.text_encoder,
                components.processor,
                prompt=text,
                image=block_state.resized_image,
                prompt_template_encode=components.config.prompt_template_encode,
                prompt_template_encode_start_idx=components.config.prompt_template_encode_start_idx,
                device=device,
            )

        block_state.prompt_embeds, block_state.prompt_embeds_mask = _embed(block_state.prompt)

        if components.requires_unconditional_embeds:
            # A missing (or empty) negative prompt falls back to the empty string.
            fallback_negative = block_state.negative_prompt or ""
            block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = _embed(fallback_negative)

        self.set_block_state(state, block_state)
        return components, state
class QwenImageInpaintProcessImagesInputStep(ModularPipelineBlocks):
    """Pipeline block that preprocesses the image together with its mask for inpainting."""

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "Image Preprocess step for inpainting task. This processes the image and mask inputs together. Images can be resized first using QwenImageEditResizeDynamicStep."

    @property
    def expected_components(self) -> List[ComponentSpec]:
        mask_processor_spec = ComponentSpec(
            "image_mask_processor",
            InpaintProcessor,
            config=FrozenDict({"vae_scale_factor": 16}),
            default_creation_method="from_config",
        )
        return [mask_processor_spec]

    @property
    def inputs(self) -> List[InputParam]:
        optional_names = ("resized_image", "image", "height", "width", "padding_mask_crop")
        return [InputParam("mask_image", required=True)] + [InputParam(name) for name in optional_names]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        overlay_output = OutputParam(
            name="mask_overlay_kwargs",
            type_hint=Dict,
            description="The kwargs for the postprocess step to apply the mask overlay",
        )
        return [
            OutputParam(name="processed_image"),
            OutputParam(name="processed_mask_image"),
            overlay_output,
        ]

    @staticmethod
    def check_inputs(height, width, vae_scale_factor):
        """Raise `ValueError` if a given `height`/`width` is not a multiple of `vae_scale_factor * 2`."""
        if height is not None:
            if height % (vae_scale_factor * 2) != 0:
                raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}")

        if width is not None:
            if width % (vae_scale_factor * 2) != 0:
                raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}")

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
        """Preprocess image and mask, preferring `resized_image` over `image` when both are available."""
        block_state = self.get_block_state(state)

        if block_state.resized_image is None and block_state.image is None:
            raise ValueError("resized_image and image cannot be None at the same time")

        if block_state.resized_image is not None:
            # An already resized image dictates the target size; its first element's `.size` is (width, height).
            target_width, target_height = block_state.resized_image[0].size
            source_image = block_state.resized_image
        else:
            source_image = block_state.image
            self.check_inputs(
                height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor
            )
            target_height = block_state.height or components.default_height
            target_width = block_state.width or components.default_width

        processed_image, processed_mask, overlay_kwargs = components.image_mask_processor.preprocess(
            image=source_image,
            mask=block_state.mask_image,
            height=target_height,
            width=target_width,
            padding_mask_crop=block_state.padding_mask_crop,
        )
        block_state.processed_image = processed_image
        block_state.processed_mask_image = processed_mask
        block_state.mask_overlay_kwargs = overlay_kwargs

        self.set_block_state(state, block_state)
        return components, state
class QwenImageProcessImagesInputStep(ModularPipelineBlocks):
    """Pipeline block that preprocesses an image (optionally already resized) with the image processor."""

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "Image Preprocess step. Images can be resized first using QwenImageEditResizeDynamicStep."

    @property
    def expected_components(self) -> List[ComponentSpec]:
        processor_spec = ComponentSpec(
            "image_processor",
            VaeImageProcessor,
            config=FrozenDict({"vae_scale_factor": 16}),
            default_creation_method="from_config",
        )
        return [processor_spec]

    @property
    def inputs(self) -> List[InputParam]:
        return [InputParam(name) for name in ("resized_image", "image", "height", "width")]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [OutputParam(name="processed_image")]

    @staticmethod
    def check_inputs(height, width, vae_scale_factor):
        """Raise `ValueError` if a given `height`/`width` is not a multiple of `vae_scale_factor * 2`."""
        if height is not None:
            if height % (vae_scale_factor * 2) != 0:
                raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}")

        if width is not None:
            if width % (vae_scale_factor * 2) != 0:
                raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}")

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState):
        """Preprocess the image, preferring `resized_image` over `image` when both are available."""
        block_state = self.get_block_state(state)

        if block_state.resized_image is None and block_state.image is None:
            raise ValueError("resized_image and image cannot be None at the same time")

        if block_state.resized_image is not None:
            # An already resized image dictates the target size; its first element's `.size` is (width, height).
            target_width, target_height = block_state.resized_image[0].size
            source_image = block_state.resized_image
        else:
            source_image = block_state.image
            self.check_inputs(
                height=block_state.height, width=block_state.width, vae_scale_factor=components.vae_scale_factor
            )
            target_height = block_state.height or components.default_height
            target_width = block_state.width or components.default_width

        block_state.processed_image = components.image_processor.preprocess(
            image=source_image,
            height=target_height,
            width=target_width,
        )

        self.set_block_state(state, block_state)
        return components, state
class QwenImageVaeEncoderDynamicStep(ModularPipelineBlocks):
    """Pipeline block that VAE-encodes a configurable image field into a configurable latents field."""

    model_name = "qwenimage"

    def __init__(
        self,
        input_name: str = "processed_image",
        output_name: str = "image_latents",
    ):
        """Initialize a VAE encoder step for converting images to latent representations.

        Both names are configurable so the same block can serve different image inputs, e.g.
        "processed_image" -> "image_latents" or "processed_control_image" -> "control_image_latents".

        Args:
            input_name (str, optional): Name of the input image tensor. Defaults to "processed_image".
            output_name (str, optional): Name of the output latent tensor. Defaults to "image_latents".

        Examples:
            # Default names
            QwenImageVaeEncoderDynamicStep()

            # Custom input/output names for a control image
            QwenImageVaeEncoderDynamicStep(
                input_name="processed_control_image", output_name="control_image_latents"
            )
        """
        self._image_input_name = input_name
        self._image_latents_output_name = output_name
        super().__init__()

    @property
    def description(self) -> str:
        return f"Dynamic VAE Encoder step that converts {self._image_input_name} into latent representations {self._image_latents_output_name}.\n"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        return [ComponentSpec("vae", AutoencoderKLQwenImage)]

    @property
    def inputs(self) -> List[InputParam]:
        image_param = InputParam(self._image_input_name, required=True)
        generator_param = InputParam("generator")
        return [image_param, generator_param]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        latents_param = OutputParam(
            self._image_latents_output_name,
            type_hint=torch.Tensor,
            description="The latents representing the reference image",
        )
        return [latents_param]

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        """Encode the configured input image and store the latents under the configured output name."""
        block_state = self.get_block_state(state)

        # Encoding runs on the execution device, in the VAE's own dtype.
        encode_kwargs = {
            "device": components._execution_device,
            "dtype": components.vae.dtype,
        }
        source_image = getattr(block_state, self._image_input_name)

        # Encode image into latents
        encoded_latents = encode_vae_image(
            image=source_image,
            vae=components.vae,
            generator=block_state.generator,
            latent_channels=components.num_channels_latents,
            **encode_kwargs,
        )

        setattr(block_state, self._image_latents_output_name, encoded_latents)
        self.set_block_state(state, block_state)
        return components, state
class QwenImageControlNetVaeEncoderStep(ModularPipelineBlocks):
    """Pipeline block that preprocesses and VAE-encodes the control image(s) for a (multi-)controlnet."""

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "VAE Encoder step that converts `control_image` into latent representations control_image_latents.\n"

    @property
    def expected_components(self) -> List[ComponentSpec]:
        processor_spec = ComponentSpec(
            "control_image_processor",
            VaeImageProcessor,
            config=FrozenDict({"vae_scale_factor": 16}),
            default_creation_method="from_config",
        )
        return [
            ComponentSpec("vae", AutoencoderKLQwenImage),
            ComponentSpec("controlnet", QwenImageControlNetModel),
            processor_spec,
        ]

    @property
    def inputs(self) -> List[InputParam]:
        return [InputParam("control_image", required=True)] + [
            InputParam(name) for name in ("height", "width", "generator")
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        latents_param = OutputParam(
            "control_image_latents",
            type_hint=torch.Tensor,
            description="The latents representing the control image",
        )
        return [latents_param]

    @staticmethod
    def check_inputs(height, width, vae_scale_factor):
        """Raise `ValueError` if a given `height`/`width` is not a multiple of `vae_scale_factor * 2`."""
        if height is not None:
            if height % (vae_scale_factor * 2) != 0:
                raise ValueError(f"Height must be divisible by {vae_scale_factor * 2} but is {height}")

        if width is not None:
            if width % (vae_scale_factor * 2) != 0:
                raise ValueError(f"Width must be divisible by {vae_scale_factor * 2} but is {width}")

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        """Encode `control_image` into `control_image_latents`.

        For a `QwenImageMultiControlNetModel` the result is a list with one latents tensor per control image (a
        single control image is wrapped into a list first); for a `QwenImageControlNetModel` it is one tensor.

        Raises:
            ValueError: If `height`/`width` fail `check_inputs`, or the controlnet is of neither supported type.
        """
        block_state = self.get_block_state(state)
        self.check_inputs(block_state.height, block_state.width, components.vae_scale_factor)

        device = components._execution_device
        dtype = components.vae.dtype
        target_height = block_state.height or components.default_height
        target_width = block_state.width or components.default_width

        def _to_latents(raw_control_image):
            # Preprocess to the target size, then sample latents from the VAE posterior.
            preprocessed = components.control_image_processor.preprocess(
                image=raw_control_image,
                height=target_height,
                width=target_width,
            )
            return encode_vae_image(
                image=preprocessed,
                vae=components.vae,
                generator=block_state.generator,
                device=device,
                dtype=dtype,
                latent_channels=components.num_channels_latents,
                sample_mode="sample",
            )

        controlnet = unwrap_module(components.controlnet)

        if isinstance(controlnet, QwenImageMultiControlNetModel):
            if not isinstance(block_state.control_image, list):
                block_state.control_image = [block_state.control_image]
            block_state.control_image_latents = [
                _to_latents(single_control_image) for single_control_image in block_state.control_image
            ]
        elif isinstance(controlnet, QwenImageControlNetModel):
            block_state.control_image_latents = _to_latents(block_state.control_image)
        else:
            raise ValueError(
                f"Expected controlnet to be a QwenImageControlNetModel or QwenImageMultiControlNetModel, got {type(controlnet)}"
            )

        self.set_block_state(state, block_state)
        return components, state
@@ -0,0 +1,431 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List, Tuple
import torch
from ...models import QwenImageMultiControlNetModel
from ..modular_pipeline import ModularPipelineBlocks, PipelineState
from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
from .modular_pipeline import QwenImageModularPipeline, QwenImagePachifier
def repeat_tensor_to_batch_size(
    input_name: str,
    input_tensor: torch.Tensor,
    batch_size: int,
    num_images_per_prompt: int = 1,
) -> torch.Tensor:
    """Repeat tensor elements to match the final batch size.

    This function expands a tensor's batch dimension to match the final batch size (batch_size *
    num_images_per_prompt) by repeating each element along dimension 0.

    The input tensor must have batch size 1 or batch_size. The function will:
    - If batch size is 1: repeat each element (batch_size * num_images_per_prompt) times
    - If batch size equals batch_size: repeat each element num_images_per_prompt times

    Args:
        input_name (str): Name of the input tensor (used for error messages)
        input_tensor (torch.Tensor): The tensor to repeat. Must have batch size 1 or batch_size.
        batch_size (int): The base batch size (number of prompts)
        num_images_per_prompt (int, optional): Number of images to generate per prompt. Defaults to 1.

    Returns:
        torch.Tensor: The repeated tensor with final batch size (batch_size * num_images_per_prompt)

    Raises:
        ValueError: If input_tensor is not a torch.Tensor or has invalid batch size

    Examples:
        >>> tensor = torch.tensor([[1, 2, 3]])  # shape: [1, 3]
        >>> repeat_tensor_to_batch_size("image", tensor, batch_size=2, num_images_per_prompt=2)
        tensor([[1, 2, 3], [1, 2, 3], [1, 2, 3], [1, 2, 3]])  # shape: [4, 3]

        >>> tensor = torch.tensor([[1, 2, 3], [4, 5, 6]])  # shape: [2, 3]
        >>> repeat_tensor_to_batch_size("image", tensor, batch_size=2, num_images_per_prompt=2)
        tensor([[1, 2, 3], [1, 2, 3], [4, 5, 6], [4, 5, 6]])  # shape: [4, 3]
    """
    # make sure input is a tensor
    if not isinstance(input_tensor, torch.Tensor):
        raise ValueError(f"`{input_name}` must be a tensor")

    # make sure input tensor e.g. image_latents has batch size 1 or batch_size same as prompts
    current_batch_size = input_tensor.shape[0]
    if current_batch_size == 1:
        repeat_by = batch_size * num_images_per_prompt
    elif current_batch_size == batch_size:
        repeat_by = num_images_per_prompt
    else:
        raise ValueError(f"`{input_name}` must have batch size 1 or {batch_size}, but got {current_batch_size}")

    # expand the tensor to match the batch_size * num_images_per_prompt; `repeat_interleave` keeps the copies of
    # each element adjacent
    return input_tensor.repeat_interleave(repeat_by, dim=0)
def calculate_dimension_from_latents(latents: torch.Tensor, vae_scale_factor: int) -> Tuple[int, int]:
    """Derive the image-space (height, width) from an unpacked latent tensor.

    The last two dimensions of `latents` are taken as latent height and latent width, and each is multiplied by
    `vae_scale_factor`.

    Args:
        latents (torch.Tensor): Unpacked latents with 4 or 5 dimensions, e.g. [batch, channels, height, width] or
            [batch, channels, frames, height, width].
        vae_scale_factor (int): Factor between image resolution and latent resolution along each spatial axis.

    Returns:
        Tuple[int, int]: The image dimensions as (height, width).

    Raises:
        ValueError: If `latents` does not have 4 or 5 dimensions.
    """
    # make sure the latents are not packed
    if latents.ndim not in (4, 5):
        raise ValueError(f"unpacked latents must have 4 or 5 dimensions, but got {latents.ndim}")

    latent_height, latent_width = latents.shape[-2:]
    return latent_height * vae_scale_factor, latent_width * vae_scale_factor
class QwenImageTextInputsStep(ModularPipelineBlocks):
    """Pipeline block that derives `batch_size`/`dtype` from the text embeddings and expands them per image."""

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        summary_section = (
            "Text input processing step that standardizes text embeddings for the pipeline.\n"
            "This step:\n"
            "  1. Determines `batch_size` and `dtype` based on `prompt_embeds`\n"
            "  2. Ensures all text embeddings have consistent batch sizes (batch_size * num_images_per_prompt)"
        )

        # Placement guidance
        placement_section = "\n\nThis block should be placed after all encoder steps to process the text embeddings before they are used in subsequent pipeline steps."

        return summary_section + placement_section

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(name="num_images_per_prompt", default=1),
            InputParam(name="prompt_embeds", required=True, kwargs_type="denoiser_input_fields"),
            InputParam(name="prompt_embeds_mask", required=True, kwargs_type="denoiser_input_fields"),
            InputParam(name="negative_prompt_embeds", kwargs_type="denoiser_input_fields"),
            InputParam(name="negative_prompt_embeds_mask", kwargs_type="denoiser_input_fields"),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        return [
            OutputParam(
                "batch_size",
                type_hint=int,
                description="Number of prompts, the final batch size of model inputs should be batch_size * num_images_per_prompt",
            ),
            OutputParam(
                "dtype",
                type_hint=torch.dtype,
                description="Data type of model tensor inputs (determined by `prompt_embeds`)",
            ),
        ]

    @staticmethod
    def check_inputs(
        prompt_embeds,
        prompt_embeds_mask,
        negative_prompt_embeds,
        negative_prompt_embeds_mask,
    ):
        """Validate that embeddings and masks come in pairs and share the batch size of `prompt_embeds`.

        Raises:
            ValueError: If only one of `negative_prompt_embeds` / `negative_prompt_embeds_mask` is given, or if any
                mask or negative embedding has a batch size different from `prompt_embeds`.
        """
        if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None:
            raise ValueError("`negative_prompt_embeds_mask` is required when `negative_prompt_embeds` is not None")

        if negative_prompt_embeds is None and negative_prompt_embeds_mask is not None:
            raise ValueError("cannot pass `negative_prompt_embeds_mask` without `negative_prompt_embeds`")

        if prompt_embeds_mask.shape[0] != prompt_embeds.shape[0]:
            raise ValueError("`prompt_embeds_mask` must have the same batch size as `prompt_embeds`")

        elif negative_prompt_embeds is not None and negative_prompt_embeds.shape[0] != prompt_embeds.shape[0]:
            raise ValueError("`negative_prompt_embeds` must have the same batch size as `prompt_embeds`")

        elif (
            negative_prompt_embeds_mask is not None and negative_prompt_embeds_mask.shape[0] != prompt_embeds.shape[0]
        ):
            raise ValueError("`negative_prompt_embeds_mask` must have the same batch size as `prompt_embeds`")

    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        """Set `batch_size`/`dtype` and expand embeddings and masks to `batch_size * num_images_per_prompt`.

        After expansion, row `b * num_images_per_prompt + k` of every tensor belongs to prompt `b`.
        """
        block_state = self.get_block_state(state)

        self.check_inputs(
            prompt_embeds=block_state.prompt_embeds,
            prompt_embeds_mask=block_state.prompt_embeds_mask,
            negative_prompt_embeds=block_state.negative_prompt_embeds,
            negative_prompt_embeds_mask=block_state.negative_prompt_embeds_mask,
        )

        block_state.batch_size = block_state.prompt_embeds.shape[0]
        block_state.dtype = block_state.prompt_embeds.dtype

        num_images = block_state.num_images_per_prompt
        final_batch_size = block_state.batch_size * num_images

        # Embeddings: tiling along the sequence axis and re-viewing places the copies of each prompt in adjacent
        # rows ([e0, e0, ..., e1, e1, ...]).
        _, seq_len, _ = block_state.prompt_embeds.shape
        block_state.prompt_embeds = block_state.prompt_embeds.repeat(1, num_images, 1)
        block_state.prompt_embeds = block_state.prompt_embeds.view(final_batch_size, seq_len, -1)

        # Masks: `repeat(1, n, 1)` on a (batch, seq_len) mask would tile whole rows ([m0, m1, m0, m1, ...]) and
        # pair masks with the wrong prompts when batch_size > 1 and num_images_per_prompt > 1.
        # `repeat_interleave` along dim 0 yields the same row order as the embeddings. The `view` keeps the
        # original requirement that the mask's sequence length matches the embeddings'.
        block_state.prompt_embeds_mask = block_state.prompt_embeds_mask.repeat_interleave(num_images, dim=0)
        block_state.prompt_embeds_mask = block_state.prompt_embeds_mask.view(final_batch_size, seq_len)

        if block_state.negative_prompt_embeds is not None:
            _, seq_len, _ = block_state.negative_prompt_embeds.shape
            block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.repeat(1, num_images, 1)
            block_state.negative_prompt_embeds = block_state.negative_prompt_embeds.view(
                final_batch_size, seq_len, -1
            )

            block_state.negative_prompt_embeds_mask = block_state.negative_prompt_embeds_mask.repeat_interleave(
                num_images, dim=0
            )
            block_state.negative_prompt_embeds_mask = block_state.negative_prompt_embeds_mask.view(
                final_batch_size, seq_len
            )

        self.set_block_state(state, block_state)

        return components, state
class QwenImageInputsDynamicStep(ModularPipelineBlocks):
model_name = "qwenimage"
def __init__(
self,
image_latent_inputs: List[str] = ["image_latents"],
additional_batch_inputs: List[str] = [],
):
"""Initialize a configurable step that standardizes the inputs for the denoising step. It:\n"
This step handles multiple common tasks to prepare inputs for the denoising step:
1. For encoded image latents, use it update height/width if None, patchifies, and expands batch size
2. For additional_batch_inputs: Only expands batch dimensions to match final batch size
This is a dynamic block that allows you to configure which inputs to process.
Args:
image_latent_inputs (List[str], optional): Names of image latent tensors to process.
These will be used to determine height/width, patchified, and batch-expanded. Can be a single string or
list of strings. Defaults to ["image_latents"]. Examples: ["image_latents"], ["control_image_latents"]
additional_batch_inputs (List[str], optional):
Names of additional conditional input tensors to expand batch size. These tensors will only have their
batch dimensions adjusted to match the final batch size. Can be a single string or list of strings.
Defaults to []. Examples: ["processed_mask_image"]
Examples:
# Configure to process image_latents (default behavior) QwenImageInputsDynamicStep()
# Configure to process multiple image latent inputs
QwenImageInputsDynamicStep(image_latent_inputs=["image_latents", "control_image_latents"])
# Configure to process image latents and additional batch inputs QwenImageInputsDynamicStep(
image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
)
"""
if not isinstance(image_latent_inputs, list):
image_latent_inputs = [image_latent_inputs]
if not isinstance(additional_batch_inputs, list):
additional_batch_inputs = [additional_batch_inputs]
self._image_latent_inputs = image_latent_inputs
self._additional_batch_inputs = additional_batch_inputs
super().__init__()
@property
def description(self) -> str:
# Functionality section
summary_section = (
"Input processing step that:\n"
" 1. For image latent inputs: Updates height/width if None, patchifies latents, and expands batch size\n"
" 2. For additional batch inputs: Expands batch dimensions to match final batch size"
)
# Inputs info
inputs_info = ""
if self._image_latent_inputs or self._additional_batch_inputs:
inputs_info = "\n\nConfigured inputs:"
if self._image_latent_inputs:
inputs_info += f"\n - Image latent inputs: {self._image_latent_inputs}"
if self._additional_batch_inputs:
inputs_info += f"\n - Additional batch inputs: {self._additional_batch_inputs}"
# Placement guidance
placement_section = "\n\nThis block should be placed after the encoder steps and the text input step."
return summary_section + inputs_info + placement_section
@property
def inputs(self) -> List[InputParam]:
    """List the inputs this block reads.

    The four fixed parameters come first, followed by one `InputParam` per
    configured image-latent input and then one per additional batch input.
    """
    fixed_params = [
        InputParam(name="num_images_per_prompt", default=1),
        InputParam(name="batch_size", required=True),
        InputParam(name="height"),
        InputParam(name="width"),
    ]
    # Image latent names precede the batch-only names, matching processing order in `__call__`.
    configured_names = [*self._image_latent_inputs, *self._additional_batch_inputs]
    return fixed_params + [InputParam(name=tensor_name) for tensor_name in configured_names]
@property
def expected_components(self) -> List[ComponentSpec]:
    """Declare the `pachifier` component (a `QwenImagePachifier` created from config) used by `__call__`."""
    pachifier_spec = ComponentSpec("pachifier", QwenImagePachifier, default_creation_method="from_config")
    return [pachifier_spec]
def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
    """Process the configured tensors on the block state and write them back.

    Image latent inputs are used to fill in a missing height/width, are packed
    with `components.pachifier.pack_latents`, and are then expanded with
    `repeat_tensor_to_batch_size`. Additional batch inputs are only expanded.
    Inputs whose value is `None` are left untouched.

    NOTE(review): the return annotation says `PipelineState`, but the method
    returns the `(components, state)` pair.
    """
    block_state = self.get_block_state(state)

    # Image latent inputs: height/width calculation, patchify, then batch expansion.
    for latent_name in self._image_latent_inputs:
        latents = getattr(block_state, latent_name)
        if latents is not None:
            latent_height, latent_width = calculate_dimension_from_latents(latents, components.vae_scale_factor)
            # An already-set (truthy) height/width wins over the value derived from the latents.
            block_state.height = block_state.height or latent_height
            block_state.width = block_state.width or latent_width

            packed_latents = components.pachifier.pack_latents(latents)
            expanded_latents = repeat_tensor_to_batch_size(
                input_name=latent_name,
                input_tensor=packed_latents,
                num_images_per_prompt=block_state.num_images_per_prompt,
                batch_size=block_state.batch_size,
            )
            setattr(block_state, latent_name, expanded_latents)

    # Additional batch inputs: batch expansion only.
    for extra_name in self._additional_batch_inputs:
        extra_tensor = getattr(block_state, extra_name)
        if extra_tensor is not None:
            expanded_extra = repeat_tensor_to_batch_size(
                input_name=extra_name,
                input_tensor=extra_tensor,
                num_images_per_prompt=block_state.num_images_per_prompt,
                batch_size=block_state.batch_size,
            )
            setattr(block_state, extra_name, expanded_extra)

    self.set_block_state(state, block_state)
    return components, state
class QwenImageControlNetInputsStep(ModularPipelineBlocks):
    """Prepare `control_image_latents` for the controlnet path.

    For each control latent tensor the step fills in a missing height/width,
    packs the tensor with `components.pachifier.pack_latents`, and expands it
    with `repeat_tensor_to_batch_size`. When `components.controlnet` is a
    `QwenImageMultiControlNetModel`, `control_image_latents` is iterated and
    each element is processed separately; otherwise it is processed as one tensor.

    NOTE(review): this block reads `components.controlnet`, `components.pachifier`
    and `components.vae_scale_factor` but declares no `expected_components` here —
    confirm the surrounding pipeline always provides them.
    """

    model_name = "qwenimage"

    @property
    def description(self) -> str:
        return "prepare the `control_image_latents` for controlnet. Insert after all the other inputs steps."

    @property
    def inputs(self) -> List[InputParam]:
        return [
            InputParam(name="control_image_latents", required=True),
            InputParam(name="batch_size", required=True),
            InputParam(name="num_images_per_prompt", default=1),
            InputParam(name="height"),
            InputParam(name="width"),
        ]

    @staticmethod
    def _prepare_control_latents(components, block_state, latents, input_name):
        """Update height/width from `latents`, pack them, and expand them to the final batch size."""
        # 1. update height/width if not provided (an already-set truthy value wins)
        height, width = calculate_dimension_from_latents(latents, components.vae_scale_factor)
        block_state.height = block_state.height or height
        block_state.width = block_state.width or width

        # 2. pack
        latents = components.pachifier.pack_latents(latents)

        # 3. repeat to match the batch size
        return repeat_tensor_to_batch_size(
            input_name=input_name,
            input_tensor=latents,
            num_images_per_prompt=block_state.num_images_per_prompt,
            batch_size=block_state.batch_size,
        )

    @torch.no_grad()
    def __call__(self, components: QwenImageModularPipeline, state: PipelineState) -> PipelineState:
        """Process `control_image_latents` on the block state and return `(components, state)`.

        NOTE(review): the return annotation says `PipelineState`, but the method
        returns the `(components, state)` pair.
        """
        block_state = self.get_block_state(state)

        if isinstance(components.controlnet, QwenImageMultiControlNetModel):
            # One latent tensor per controlnet; the index is included in the input name.
            block_state.control_image_latents = [
                self._prepare_control_latents(components, block_state, latents, f"control_image_latents[{i}]")
                for i, latents in enumerate(block_state.control_image_latents)
            ]
        else:
            block_state.control_image_latents = self._prepare_control_latents(
                components, block_state, block_state.control_image_latents, "control_image_latents"
            )

        self.set_block_state(state, block_state)
        return components, state
@@ -0,0 +1,841 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict
from .before_denoise import (
QwenImageControlNetBeforeDenoiserStep,
QwenImageCreateMaskLatentsStep,
QwenImageEditRoPEInputsStep,
QwenImagePrepareLatentsStep,
QwenImagePrepareLatentsWithStrengthStep,
QwenImageRoPEInputsStep,
QwenImageSetTimestepsStep,
QwenImageSetTimestepsWithStrengthStep,
)
from .decoders import QwenImageDecoderStep, QwenImageInpaintProcessImagesOutputStep, QwenImageProcessImagesOutputStep
from .denoise import (
QwenImageControlNetDenoiseStep,
QwenImageDenoiseStep,
QwenImageEditDenoiseStep,
QwenImageEditInpaintDenoiseStep,
QwenImageInpaintControlNetDenoiseStep,
QwenImageInpaintDenoiseStep,
QwenImageLoopBeforeDenoiserControlNet,
)
from .encoders import (
QwenImageControlNetVaeEncoderStep,
QwenImageEditResizeDynamicStep,
QwenImageEditTextEncoderStep,
QwenImageInpaintProcessImagesInputStep,
QwenImageProcessImagesInputStep,
QwenImageTextEncoderStep,
QwenImageVaeEncoderDynamicStep,
)
from .inputs import QwenImageControlNetInputsStep, QwenImageInputsDynamicStep, QwenImageTextInputsStep
logger = logging.get_logger(__name__)
# 1. QwenImage
## 1.1 QwenImage/text2image
#### QwenImage/decode
#### (standard decode step works for most tasks except for inpaint)
# Ordered (name, block) pairs for the standard decode path. `QwenImageDecodeStep`
# takes its `block_names` / `block_classes` from this mapping's keys / values.
QwenImageDecodeBlocks = InsertableDict(
    [
        ("decode", QwenImageDecoderStep()),
        ("postprocess", QwenImageProcessImagesOutputStep()),
    ]
)
class QwenImageDecodeStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageDecodeBlocks`."""

    model_name = "qwenimage"
    block_names = QwenImageDecodeBlocks.keys()
    block_classes = QwenImageDecodeBlocks.values()

    @property
    def description(self):
        """Return the one-line summary of this step."""
        summary = "Decode step that decodes the latents to images and postprocess the generated image."
        return summary
#### QwenImage/text2image presets
# Preset: ordered (name, block) pairs for text-to-image.
# Registered in `ALL_BLOCKS` under the "text2image" key.
TEXT2IMAGE_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageTextEncoderStep()),
        ("input", QwenImageTextInputsStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
        ("denoise", QwenImageDenoiseStep()),
        ("decode", QwenImageDecodeStep()),
    ]
)
## 1.2 QwenImage/inpaint
#### QwenImage/inpaint vae encoder
# Ordered (name, block) pairs for the inpaint VAE-encoder path. `QwenImageInpaintVaeEncoderStep`
# takes its `block_names` / `block_classes` from this mapping's keys / values.
QwenImageInpaintVaeEncoderBlocks = InsertableDict(
    [
        # image, mask_image -> processed_image, processed_mask_image, mask_overlay_kwargs
        # Stored as an instance, like every other entry in this module's block mappings,
        # so direct consumers of the mapping always see block instances.
        ("preprocess", QwenImageInpaintProcessImagesInputStep()),
        ("encode", QwenImageVaeEncoderDynamicStep()),  # processed_image -> image_latents
    ]
)
class QwenImageInpaintVaeEncoderStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageInpaintVaeEncoderBlocks`."""

    model_name = "qwenimage"
    block_names = QwenImageInpaintVaeEncoderBlocks.keys()
    block_classes = QwenImageInpaintVaeEncoderBlocks.values()

    @property
    def description(self) -> str:
        """Return the multi-line summary of this step."""
        lines = (
            "This step is used for processing image and mask inputs for inpainting tasks. It:\n",
            " - Resizes the image to the target size, based on `height` and `width`.\n",
            " - Processes and updates `image` and `mask_image`.\n",
            " - Creates `image_latents`.",
        )
        return "".join(lines)
#### QwenImage/inpaint inputs
# Ordered (name, block) pairs for the inpaint input path. `QwenImageInpaintInputStep`
# takes its `block_names` / `block_classes` from this mapping's keys / values.
QwenImageInpaintInputBlocks = InsertableDict(
    [
        ("text_inputs", QwenImageTextInputsStep()),  # default step to process text embeddings
        (
            "additional_inputs",
            # `image_latents` is handled as an image latent input; `processed_mask_image`
            # is passed as an additional batch input.
            QwenImageInputsDynamicStep(
                image_latent_inputs=["image_latents"], additional_batch_inputs=["processed_mask_image"]
            ),
        ),
    ]
)
class QwenImageInpaintInputStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageInpaintInputBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageInpaintInputBlocks.values()
    block_names = QwenImageInpaintInputBlocks.keys()

    @property
    def description(self):
        """Return the multi-line summary of this step."""
        # The parentheses are required: without them the `return` statement ends after the
        # first literal and the remaining lines are unreachable expression statements.
        return (
            "Input step that prepares the inputs for the inpainting denoising step. It:\n"
            " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents` and `processed_mask_image`).\n"
            " - update height/width based `image_latents`, patchify `image_latents`."
        )
# QwenImage/inpaint prepare latents
# Ordered (name, block) pairs for inpaint latent preparation. `QwenImageInpaintPrepareLatentsStep`
# takes its `block_names` / `block_classes` from this mapping's keys / values.
QwenImageInpaintPrepareLatentsBlocks = InsertableDict(
    [
        ("add_noise_to_latents", QwenImagePrepareLatentsWithStrengthStep()),
        ("create_mask_latents", QwenImageCreateMaskLatentsStep()),
    ]
)
class QwenImageInpaintPrepareLatentsStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageInpaintPrepareLatentsBlocks`."""

    model_name = "qwenimage"
    block_names = QwenImageInpaintPrepareLatentsBlocks.keys()
    block_classes = QwenImageInpaintPrepareLatentsBlocks.values()

    @property
    def description(self) -> str:
        """Return the multi-line summary of this step."""
        lines = (
            "This step prepares the latents/image_latents and mask inputs for the inpainting denoising step. It:\n",
            " - Add noise to the image latents to create the latents input for the denoiser.\n",
            " - Create the pachified latents `mask` based on the processedmask image.\n",
        )
        return "".join(lines)
#### QwenImage/inpaint decode
# Ordered (name, block) pairs for the inpaint decode path; it differs from `QwenImageDecodeBlocks`
# only in the "postprocess" entry. `QwenImageInpaintDecodeStep` takes its `block_names` /
# `block_classes` from this mapping's keys / values.
QwenImageInpaintDecodeBlocks = InsertableDict(
    [
        ("decode", QwenImageDecoderStep()),
        ("postprocess", QwenImageInpaintProcessImagesOutputStep()),
    ]
)
class QwenImageInpaintDecodeStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageInpaintDecodeBlocks`."""

    model_name = "qwenimage"
    block_names = QwenImageInpaintDecodeBlocks.keys()
    block_classes = QwenImageInpaintDecodeBlocks.values()

    @property
    def description(self):
        """Return the one-line summary of this step."""
        summary = "Decode step that decodes the latents to images and postprocess the generated image, optional apply the mask overally to the original image."
        return summary
#### QwenImage/inpaint presets
# Preset: ordered (name, block) pairs for inpainting.
# Registered in `ALL_BLOCKS` under the "inpaint" key.
INPAINT_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageTextEncoderStep()),
        ("vae_encoder", QwenImageInpaintVaeEncoderStep()),
        ("input", QwenImageInpaintInputStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
        ("denoise", QwenImageInpaintDenoiseStep()),
        ("decode", QwenImageInpaintDecodeStep()),
    ]
)
## 1.3 QwenImage/img2img
#### QwenImage/img2img vae encoder
# Ordered (name, block) pairs for the img2img VAE-encoder path. `QwenImageImg2ImgVaeEncoderStep`
# takes its `block_names` / `block_classes` from this mapping's keys / values.
QwenImageImg2ImgVaeEncoderBlocks = InsertableDict(
    [
        ("preprocess", QwenImageProcessImagesInputStep()),
        ("encode", QwenImageVaeEncoderDynamicStep()),
    ]
)
class QwenImageImg2ImgVaeEncoderStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageImg2ImgVaeEncoderBlocks`."""

    model_name = "qwenimage"
    block_names = QwenImageImg2ImgVaeEncoderBlocks.keys()
    block_classes = QwenImageImg2ImgVaeEncoderBlocks.values()

    @property
    def description(self) -> str:
        """Return the one-line summary of this step."""
        summary = "Vae encoder step that preprocess andencode the image inputs into their latent representations."
        return summary
#### QwenImage/img2img inputs
# Ordered (name, block) pairs for the img2img input path. `QwenImageImg2ImgInputStep`
# takes its `block_names` / `block_classes` from this mapping's keys / values.
QwenImageImg2ImgInputBlocks = InsertableDict(
    [
        ("text_inputs", QwenImageTextInputsStep()),  # default step to process text embeddings
        ("additional_inputs", QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])),
    ]
)
class QwenImageImg2ImgInputStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageImg2ImgInputBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageImg2ImgInputBlocks.values()
    block_names = QwenImageImg2ImgInputBlocks.keys()

    @property
    def description(self):
        """Return the multi-line summary of this step."""
        # The parentheses are required: without them the `return` statement ends after the
        # first literal and the remaining lines are unreachable expression statements.
        return (
            "Input step that prepares the inputs for the img2img denoising step. It:\n"
            " - make sure the text embeddings have consistent batch size as well as the additional inputs (`image_latents`).\n"
            " - update height/width based `image_latents`, patchify `image_latents`."
        )
#### QwenImage/img2img presets
# Preset: ordered (name, block) pairs for image-to-image.
# Registered in `ALL_BLOCKS` under the "img2img" key.
IMAGE2IMAGE_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageTextEncoderStep()),
        ("vae_encoder", QwenImageImg2ImgVaeEncoderStep()),
        ("input", QwenImageImg2ImgInputStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
        ("denoise", QwenImageDenoiseStep()),
        ("decode", QwenImageDecodeStep()),
    ]
)
## 1.4 QwenImage/controlnet
#### QwenImage/controlnet presets
# Preset: the controlnet-specific (name, block) pairs. Registered in `ALL_BLOCKS` under the
# "controlnet" key. Unlike the other presets, it contains no text-encoder, denoise or decode
# entry; the inline comments describe where each block is meant to be inserted.
CONTROLNET_BLOCKS = InsertableDict(
    [
        ("controlnet_vae_encoder", QwenImageControlNetVaeEncoderStep()),  # vae encoder step for control_image
        ("controlnet_inputs", QwenImageControlNetInputsStep()),  # additional input step for controlnet
        (
            "controlnet_before_denoise",
            QwenImageControlNetBeforeDenoiserStep(),
        ),  # before denoise step (after set_timesteps step)
        (
            "controlnet_denoise_loop_before",
            QwenImageLoopBeforeDenoiserControlNet(),
        ),  # controlnet loop step (insert before the denoiseloop_denoiser)
    ]
)
## 1.5 QwenImage/auto encoders
#### for inpaint and img2img tasks
class QwenImageAutoVaeEncoderStep(AutoPipelineBlocks):
    """Auto block choosing between the inpaint and img2img VAE-encoder steps.

    Each entry of `block_trigger_inputs` is paired by position with the block
    in `block_classes`; see `description` for the documented selection rules.
    """

    block_names = ["inpaint", "img2img"]
    block_trigger_inputs = ["mask_image", "image"]
    block_classes = [QwenImageInpaintVaeEncoderStep, QwenImageImg2ImgVaeEncoderStep]

    @property
    def description(self):
        """Return the multi-line summary of this auto block."""
        lines = (
            "Vae encoder step that encode the image inputs into their latent representations.\n",
            "This is an auto pipeline block.\n",
            " - `QwenImageInpaintVaeEncoderStep` (inpaint) is used when `mask_image` is provided.\n",
            " - `QwenImageImg2ImgVaeEncoderStep` (img2img) is used when `image` is provided.\n",
            " - if `mask_image` or `image` is not provided, step will be skipped.",
        )
        return "".join(lines)
# for controlnet tasks
class QwenImageOptionalControlNetVaeEncoderStep(AutoPipelineBlocks):
    """Auto block wrapping the controlnet VAE-encoder step, triggered by `control_image`."""

    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image"]
    block_classes = [QwenImageControlNetVaeEncoderStep]

    @property
    def description(self):
        """Return the multi-line summary of this auto block."""
        lines = (
            "Vae encoder step that encode the image inputs into their latent representations.\n",
            "This is an auto pipeline block.\n",
            " - `QwenImageControlNetVaeEncoderStep` (controlnet) is used when `control_image` is provided.\n",
            " - if `control_image` is not provided, step will be skipped.",
        )
        return "".join(lines)
## 1.6 QwenImage/auto inputs
# text2image/inpaint/img2img
class QwenImageAutoInputStep(AutoPipelineBlocks):
    """Auto block choosing between the inpaint, img2img and text2image input steps.

    The last trigger is `None`, paired with the text2image block; see
    `description` for the documented selection rules.
    """

    block_names = ["inpaint", "img2img", "text2image"]
    block_trigger_inputs = ["processed_mask_image", "image_latents", None]
    block_classes = [QwenImageInpaintInputStep, QwenImageImg2ImgInputStep, QwenImageTextInputsStep]

    @property
    def description(self):
        """Return the multi-line summary of this auto block."""
        lines = (
            "Input step that standardize the inputs for the denoising step, e.g. make sure inputs have consistent batch size, and patchified. \n",
            " This is an auto pipeline block that works for text2image/inpaint/img2img tasks.\n",
            " - `QwenImageInpaintInputStep` (inpaint) is used when `processed_mask_image` is provided.\n",
            " - `QwenImageImg2ImgInputStep` (img2img) is used when `image_latents` is provided.\n",
            " - `QwenImageTextInputsStep` (text2image) is used when both `processed_mask_image` and `image_latents` are not provided.\n",
        )
        return "".join(lines)
# controlnet
class QwenImageOptionalControlNetInputStep(AutoPipelineBlocks):
    """Auto block wrapping the controlnet inputs step, triggered by `control_image_latents`."""

    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image_latents"]
    block_classes = [QwenImageControlNetInputsStep]

    @property
    def description(self):
        """Return the multi-line summary of this auto block."""
        lines = (
            "Controlnet input step that prepare the control_image_latents input.\n",
            "This is an auto pipeline block.\n",
            " - `QwenImageControlNetInputsStep` (controlnet) is used when `control_image_latents` is provided.\n",
            " - if `control_image_latents` is not provided, step will be skipped.",
        )
        return "".join(lines)
## 1.7 QwenImage/auto before denoise step
# compose the steps into a BeforeDenoiseStep for text2image/img2img/inpaint tasks before combine into an auto step
# QwenImage/text2image before denoise
# Ordered (name, block) pairs run before denoising for text2image.
# `QwenImageText2ImageBeforeDenoiseStep` takes its `block_names` / `block_classes`
# from this mapping's keys / values.
QwenImageText2ImageBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
    ]
)
class QwenImageText2ImageBeforeDenoiseStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageText2ImageBeforeDenoiseBlocks`."""

    model_name = "qwenimage"
    block_names = QwenImageText2ImageBeforeDenoiseBlocks.keys()
    block_classes = QwenImageText2ImageBeforeDenoiseBlocks.values()

    @property
    def description(self):
        """Return the one-line summary of this step."""
        summary = "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for text2image task."
        return summary
# QwenImage/inpaint before denoise
# Ordered (name, block) pairs run before denoising for inpaint.
# `QwenImageInpaintBeforeDenoiseStep` takes its `block_names` / `block_classes`
# from this mapping's keys / values.
QwenImageInpaintBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
    ]
)
class QwenImageInpaintBeforeDenoiseStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageInpaintBeforeDenoiseBlocks`."""

    model_name = "qwenimage"
    block_names = QwenImageInpaintBeforeDenoiseBlocks.keys()
    block_classes = QwenImageInpaintBeforeDenoiseBlocks.values()

    @property
    def description(self):
        """Return the one-line summary of this step."""
        summary = "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for inpaint task."
        return summary
# QwenImage/img2img before denoise
# Ordered (name, block) pairs run before denoising for img2img.
# `QwenImageImg2ImgBeforeDenoiseStep` takes its `block_names` / `block_classes`
# from this mapping's keys / values.
QwenImageImg2ImgBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_img2img_latents", QwenImagePrepareLatentsWithStrengthStep()),
        ("prepare_rope_inputs", QwenImageRoPEInputsStep()),
    ]
)
class QwenImageImg2ImgBeforeDenoiseStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageImg2ImgBeforeDenoiseBlocks`."""

    model_name = "qwenimage"
    block_names = QwenImageImg2ImgBeforeDenoiseBlocks.keys()
    block_classes = QwenImageImg2ImgBeforeDenoiseBlocks.values()

    @property
    def description(self):
        """Return the one-line summary of this step."""
        summary = "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for img2img task."
        return summary
# auto before_denoise step for text2image, inpaint, img2img tasks
class QwenImageAutoBeforeDenoiseStep(AutoPipelineBlocks):
    """Auto block choosing between the inpaint, img2img and text2image before-denoise steps.

    The last trigger is `None`, paired with the text2image block; see
    `description` for the documented selection rules.
    """

    block_names = ["inpaint", "img2img", "text2image"]
    block_trigger_inputs = ["processed_mask_image", "image_latents", None]
    block_classes = [
        QwenImageInpaintBeforeDenoiseStep,
        QwenImageImg2ImgBeforeDenoiseStep,
        QwenImageText2ImageBeforeDenoiseStep,
    ]

    @property
    def description(self):
        """Return the multi-line summary of this auto block."""
        lines = (
            "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n",
            "This is an auto pipeline block that works for text2img, inpainting, img2img tasks.\n",
            " - `QwenImageInpaintBeforeDenoiseStep` (inpaint) is used when `processed_mask_image` is provided.\n",
            " - `QwenImageImg2ImgBeforeDenoiseStep` (img2img) is used when `image_latents` is provided.\n",
            " - `QwenImageText2ImageBeforeDenoiseStep` (text2image) is used when both `processed_mask_image` and `image_latents` are not provided.\n",
        )
        return "".join(lines)
# auto before_denoise step for controlnet tasks
class QwenImageOptionalControlNetBeforeDenoiseStep(AutoPipelineBlocks):
    """Auto block wrapping the controlnet before-denoise step, triggered by `control_image_latents`."""

    block_names = ["controlnet"]
    block_trigger_inputs = ["control_image_latents"]
    block_classes = [QwenImageControlNetBeforeDenoiserStep]

    @property
    def description(self):
        """Return the multi-line summary of this auto block."""
        lines = (
            "Controlnet before denoise step that prepare the controlnet input.\n",
            "This is an auto pipeline block.\n",
            " - `QwenImageControlNetBeforeDenoiserStep` (controlnet) is used when `control_image_latents` is provided.\n",
            " - if `control_image_latents` is not provided, step will be skipped.",
        )
        return "".join(lines)
## 1.8 QwenImage/auto denoise
# auto denoise step for controlnet tasks: works for all tasks with controlnet
class QwenImageControlNetAutoDenoiseStep(AutoPipelineBlocks):
    """Auto block choosing between the inpaint and regular controlnet denoise steps.

    The last trigger is `None`, paired with the non-inpaint block; see
    `description` for the documented selection rules.
    """

    block_names = ["inpaint_denoise", "denoise"]
    block_trigger_inputs = ["mask", None]
    block_classes = [QwenImageInpaintControlNetDenoiseStep, QwenImageControlNetDenoiseStep]

    @property
    def description(self):
        """Return the multi-line summary of this auto block."""
        lines = (
            "Controlnet step during the denoising process. \n",
            " This is an auto pipeline block that works for inpaint and text2image/img2img tasks with controlnet.\n",
            " - `QwenImageInpaintControlNetDenoiseStep` (inpaint) is used when `mask` is provided.\n",
            " - `QwenImageControlNetDenoiseStep` (text2image/img2img) is used when `mask` is not provided.\n",
        )
        return "".join(lines)
# auto denoise step for everything: works for all tasks with or without controlnet
class QwenImageAutoDenoiseStep(AutoPipelineBlocks):
    """Auto block choosing between the controlnet, inpaint and plain denoise steps.

    The last trigger is `None`, paired with the plain denoise block; see
    `description` for the documented selection rules.
    """

    block_names = ["controlnet_denoise", "inpaint_denoise", "denoise"]
    block_trigger_inputs = ["control_image_latents", "mask", None]
    block_classes = [
        QwenImageControlNetAutoDenoiseStep,
        QwenImageInpaintDenoiseStep,
        QwenImageDenoiseStep,
    ]

    @property
    def description(self):
        """Return the multi-line summary of this auto block."""
        lines = (
            "Denoise step that iteratively denoise the latents. \n",
            " This is an auto pipeline block that works for inpaint/text2image/img2img tasks. It also works with controlnet\n",
            " - `QwenImageControlNetAutoDenoiseStep` (controlnet) is used when `control_image_latents` is provided.\n",
            " - `QwenImageInpaintDenoiseStep` (inpaint) is used when `mask` is provided and `control_image_latents` is not provided.\n",
            " - `QwenImageDenoiseStep` (text2image/img2img) is used when `mask` is not provided and `control_image_latents` is not provided.\n",
        )
        return "".join(lines)
## 1.9 QwenImage/auto decode
# auto decode step for inpaint and text2image tasks
class QwenImageAutoDecodeStep(AutoPipelineBlocks):
    """Auto block choosing between the inpaint and standard decode steps.

    The last trigger is `None`, paired with the standard decode block; see
    `description` for the documented selection rules.
    """

    block_names = ["inpaint_decode", "decode"]
    block_trigger_inputs = ["mask", None]
    block_classes = [QwenImageInpaintDecodeStep, QwenImageDecodeStep]

    @property
    def description(self):
        """Return the multi-line summary of this auto block."""
        lines = (
            "Decode step that decode the latents into images. \n",
            " This is an auto pipeline block that works for inpaint/text2image/img2img tasks, for both QwenImage and QwenImage-Edit.\n",
            " - `QwenImageInpaintDecodeStep` (inpaint) is used when `mask` is provided.\n",
            " - `QwenImageDecodeStep` (text2image/img2img) is used when `mask` is not provided.\n",
        )
        return "".join(lines)
## 1.10 QwenImage/auto block & presets
# Preset: ordered (name, block) pairs built from the auto / optional blocks defined above.
# Registered in `ALL_BLOCKS` under the "auto" key; `QwenImageAutoBlocks` takes its
# `block_names` / `block_classes` from this mapping's keys / values.
AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageTextEncoderStep()),
        ("vae_encoder", QwenImageAutoVaeEncoderStep()),
        ("controlnet_vae_encoder", QwenImageOptionalControlNetVaeEncoderStep()),
        ("input", QwenImageAutoInputStep()),
        ("controlnet_input", QwenImageOptionalControlNetInputStep()),
        ("before_denoise", QwenImageAutoBeforeDenoiseStep()),
        ("controlnet_before_denoise", QwenImageOptionalControlNetBeforeDenoiseStep()),
        ("denoise", QwenImageAutoDenoiseStep()),
        ("decode", QwenImageAutoDecodeStep()),
    ]
)
class QwenImageAutoBlocks(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `AUTO_BLOCKS`."""

    model_name = "qwenimage"
    block_names = AUTO_BLOCKS.keys()
    block_classes = AUTO_BLOCKS.values()

    @property
    def description(self):
        """Return the multi-line summary listing which inputs select which task."""
        lines = (
            "Auto Modular pipeline for text-to-image, image-to-image, inpainting, and controlnet tasks using QwenImage.\n",
            "- for image-to-image generation, you need to provide `image`\n",
            "- for inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n",
            "- to run the controlnet workflow, you need to provide `control_image`\n",
            "- for text-to-image generation, all you need to provide is `prompt`",
        )
        return "".join(lines)
# 2. QwenImage-Edit
## 2.1 QwenImage-Edit/edit
#### QwenImage-Edit/edit vl encoder: take both image and text prompts
# Ordered (name, block) pairs for the QwenImage-Edit VL-encoder path. `QwenImageEditVLEncoderStep`
# takes its `block_names` / `block_classes` from this mapping's keys / values.
QwenImageEditVLEncoderBlocks = InsertableDict(
    [
        ("resize", QwenImageEditResizeDynamicStep()),
        ("encode", QwenImageEditTextEncoderStep()),
    ]
)
class QwenImageEditVLEncoderStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageEditVLEncoderBlocks`."""

    model_name = "qwenimage"
    block_names = QwenImageEditVLEncoderBlocks.keys()
    block_classes = QwenImageEditVLEncoderBlocks.values()

    @property
    def description(self) -> str:
        """Return the one-line summary of this step."""
        summary = "QwenImage-Edit VL encoder step that encode the image an text prompts together."
        return summary
#### QwenImage-Edit/edit vae encoder
# Ordered (name, block) pairs for the QwenImage-Edit VAE-encoder path. `QwenImageEditVaeEncoderStep`
# takes its `block_names` / `block_classes` from this mapping's keys / values.
QwenImageEditVaeEncoderBlocks = InsertableDict(
    [
        ("resize", QwenImageEditResizeDynamicStep()),  # edit has a different resize step
        ("preprocess", QwenImageProcessImagesInputStep()),  # resized_image -> processed_image
        ("encode", QwenImageVaeEncoderDynamicStep()),  # processed_image -> image_latents
    ]
)
class QwenImageEditVaeEncoderStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageEditVaeEncoderBlocks`."""

    model_name = "qwenimage"
    block_names = QwenImageEditVaeEncoderBlocks.keys()
    block_classes = QwenImageEditVaeEncoderBlocks.values()

    @property
    def description(self) -> str:
        """Return the one-line summary of this step."""
        summary = "Vae encoder step that encode the image inputs into their latent representations."
        return summary
#### QwenImage-Edit/edit input
# Ordered (name, block) pairs for the QwenImage-Edit input path. `QwenImageEditInputStep`
# takes its `block_names` / `block_classes` from this mapping's keys / values.
QwenImageEditInputBlocks = InsertableDict(
    [
        ("text_inputs", QwenImageTextInputsStep()),  # default step to process text embeddings
        ("additional_inputs", QwenImageInputsDynamicStep(image_latent_inputs=["image_latents"])),
    ]
)
class QwenImageEditInputStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageEditInputBlocks`."""

    model_name = "qwenimage"
    block_classes = QwenImageEditInputBlocks.values()
    block_names = QwenImageEditInputBlocks.keys()

    @property
    def description(self):
        """Return the multi-line summary of this step."""
        # The parentheses are required: without them the `return` statement ends after the
        # first literal and the remaining lines are unreachable expression statements.
        return (
            "Input step that prepares the inputs for the edit denoising step. It:\n"
            " - make sure the text embeddings have consistent batch size as well as the additional inputs: \n"
            " - `image_latents`.\n"
            " - update height/width based `image_latents`, patchify `image_latents`."
        )
#### QwenImage-Edit/edit presets
# Preset: ordered (name, block) pairs for the QwenImage-Edit edit task.
# Registered in `ALL_BLOCKS` under the "edit" key.
EDIT_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageEditVLEncoderStep()),
        ("vae_encoder", QwenImageEditVaeEncoderStep()),
        ("input", QwenImageEditInputStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsStep()),
        ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
        ("denoise", QwenImageEditDenoiseStep()),
        ("decode", QwenImageDecodeStep()),
    ]
)
## 2.2 QwenImage-Edit/edit inpaint
#### QwenImage-Edit/edit inpaint vae encoder: the difference from regular inpaint is the resize step
# Ordered (name, block) pairs for the QwenImage-Edit inpaint VAE-encoder path.
# `QwenImageEditInpaintVaeEncoderStep` takes its `block_names` / `block_classes`
# from this mapping's keys / values.
QwenImageEditInpaintVaeEncoderBlocks = InsertableDict(
    [
        ("resize", QwenImageEditResizeDynamicStep()),  # image -> resized_image
        # resized_image, mask_image -> processed_image, processed_mask_image, mask_overlay_kwargs
        # Stored as an instance, like every other entry in this module's block mappings,
        # so direct consumers of the mapping always see block instances.
        ("preprocess", QwenImageInpaintProcessImagesInputStep()),
        (
            "encode",
            QwenImageVaeEncoderDynamicStep(input_name="processed_image", output_name="image_latents"),
        ),  # processed_image -> image_latents
    ]
)
class QwenImageEditInpaintVaeEncoderStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageEditInpaintVaeEncoderBlocks`."""

    model_name = "qwenimage"
    block_names = QwenImageEditInpaintVaeEncoderBlocks.keys()
    block_classes = QwenImageEditInpaintVaeEncoderBlocks.values()

    @property
    def description(self) -> str:
        """Return the multi-line summary of this step."""
        lines = (
            "This step is used for processing image and mask inputs for QwenImage-Edit inpaint tasks. It:\n",
            " - resize the image for target area (1024 * 1024) while maintaining the aspect ratio.\n",
            " - process the resized image and mask image.\n",
            " - create image latents.",
        )
        return "".join(lines)
#### QwenImage-Edit/edit inpaint presets
# Preset: ordered (name, block) pairs for the QwenImage-Edit inpaint task.
# Registered in `ALL_BLOCKS` under the "edit_inpaint" key.
EDIT_INPAINT_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageEditVLEncoderStep()),
        ("vae_encoder", QwenImageEditInpaintVaeEncoderStep()),
        ("input", QwenImageInpaintInputStep()),
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),
        ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
        ("denoise", QwenImageEditInpaintDenoiseStep()),
        ("decode", QwenImageInpaintDecodeStep()),
    ]
)
## 2.3 QwenImage-Edit/auto encoders
class QwenImageEditAutoVaeEncoderStep(AutoPipelineBlocks):
    """Auto block choosing between the edit-inpaint and edit VAE-encoder steps.

    Each entry of `block_trigger_inputs` is paired by position with the block
    in `block_classes`; see `description` for the documented selection rules.
    """

    block_names = ["edit_inpaint", "edit"]
    block_trigger_inputs = ["mask_image", "image"]
    block_classes = [
        QwenImageEditInpaintVaeEncoderStep,
        QwenImageEditVaeEncoderStep,
    ]

    @property
    def description(self):
        """Return the multi-line summary of this auto block."""
        lines = (
            "Vae encoder step that encode the image inputs into their latent representations. \n",
            " This is an auto pipeline block that works for edit and edit_inpaint tasks.\n",
            " - `QwenImageEditInpaintVaeEncoderStep` (edit_inpaint) is used when `mask_image` is provided.\n",
            " - `QwenImageEditVaeEncoderStep` (edit) is used when `image` is provided.\n",
            " - if `mask_image` or `image` is not provided, step will be skipped.",
        )
        return "".join(lines)
## 2.4 QwenImage-Edit/auto inputs
class QwenImageEditAutoInputStep(AutoPipelineBlocks):
    """Auto block choosing between the inpaint and edit input steps.

    Each entry of `block_trigger_inputs` is paired by position with the block
    in `block_classes`; see `description` for the documented selection rules.
    """

    block_classes = [QwenImageInpaintInputStep, QwenImageEditInputStep]
    block_names = ["edit_inpaint", "edit"]
    # The edit branch is keyed on `image_latents` (not `image`): that is what the description
    # below documents, what `QwenImageEditInputStep` is configured to process, and what the
    # other QwenImage-Edit auto blocks (before-denoise, denoise) trigger on.
    block_trigger_inputs = ["processed_mask_image", "image_latents"]

    @property
    def description(self):
        """Return the multi-line summary of this auto block."""
        return (
            "Input step that prepares the inputs for the edit denoising step.\n"
            + " It is an auto pipeline block that works for edit and edit_inpaint tasks.\n"
            + " - `QwenImageInpaintInputStep` (edit_inpaint) is used when `processed_mask_image` is provided.\n"
            + " - `QwenImageEditInputStep` (edit) is used when `image_latents` is provided.\n"
            + " - if `processed_mask_image` or `image_latents` is not provided, step will be skipped."
        )
## 2.5 QwenImage-Edit/auto before denoise
# compose the steps into a BeforeDenoiseStep for edit and edit_inpaint tasks before combine into an auto step
#### QwenImage-Edit/edit before denoise
# Ordered (name, block) pairs run before denoising for the edit task.
# `QwenImageEditBeforeDenoiseStep` takes its `block_names` / `block_classes`
# from this mapping's keys / values.
QwenImageEditBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsStep()),
        ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
    ]
)
class QwenImageEditBeforeDenoiseStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageEditBeforeDenoiseBlocks`."""

    model_name = "qwenimage"
    block_names = QwenImageEditBeforeDenoiseBlocks.keys()
    block_classes = QwenImageEditBeforeDenoiseBlocks.values()

    @property
    def description(self):
        """Return the one-line summary of this step."""
        summary = "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit task."
        return summary
#### QwenImage-Edit/edit inpaint before denoise
# Ordered (name, block) pairs run before denoising for the edit inpaint task.
# `QwenImageEditInpaintBeforeDenoiseStep` takes its `block_names` / `block_classes`
# from this mapping's keys / values.
QwenImageEditInpaintBeforeDenoiseBlocks = InsertableDict(
    [
        ("prepare_latents", QwenImagePrepareLatentsStep()),
        ("set_timesteps", QwenImageSetTimestepsWithStrengthStep()),
        ("prepare_inpaint_latents", QwenImageInpaintPrepareLatentsStep()),
        ("prepare_rope_inputs", QwenImageEditRoPEInputsStep()),
    ]
)
class QwenImageEditInpaintBeforeDenoiseStep(SequentialPipelineBlocks):
    """Sequential block assembled from the entries of `QwenImageEditInpaintBeforeDenoiseBlocks`."""

    model_name = "qwenimage"
    block_names = QwenImageEditInpaintBeforeDenoiseBlocks.keys()
    block_classes = QwenImageEditInpaintBeforeDenoiseBlocks.values()

    @property
    def description(self):
        """Return the one-line summary of this step."""
        summary = "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step for edit inpaint task."
        return summary
# auto before_denoise step for edit and edit_inpaint tasks
class QwenImageEditAutoBeforeDenoiseStep(AutoPipelineBlocks):
    """Auto block choosing between the edit-inpaint and edit before-denoise steps.

    Each entry of `block_trigger_inputs` is paired by position with the block
    in `block_classes`; see `description` for the documented selection rules.
    """

    model_name = "qwenimage-edit"
    block_names = ["edit_inpaint", "edit"]
    block_trigger_inputs = ["processed_mask_image", "image_latents"]
    block_classes = [
        QwenImageEditInpaintBeforeDenoiseStep,
        QwenImageEditBeforeDenoiseStep,
    ]

    @property
    def description(self):
        """Return the multi-line summary of this auto block."""
        lines = (
            "Before denoise step that prepare the inputs (timesteps, latents, rope inputs etc.) for the denoise step.\n",
            "This is an auto pipeline block that works for edit (img2img) and edit inpaint tasks.\n",
            " - `QwenImageEditInpaintBeforeDenoiseStep` (edit_inpaint) is used when `processed_mask_image` is provided.\n",
            " - `QwenImageEditBeforeDenoiseStep` (edit) is used when `image_latents` is provided and `processed_mask_image` is not provided.\n",
            " - if `image_latents` or `processed_mask_image` is not provided, step will be skipped.",
        )
        return "".join(lines)
## 2.6 QwenImage-Edit/auto denoise
class QwenImageEditAutoDenoiseStep(AutoPipelineBlocks):
    """Auto block choosing between the edit-inpaint and edit denoise steps.

    Each entry of `block_trigger_inputs` is paired by position with the block
    in `block_classes`; see `description` for the documented selection rules.
    """

    model_name = "qwenimage-edit"
    block_names = ["inpaint_denoise", "denoise"]
    block_trigger_inputs = ["processed_mask_image", "image_latents"]
    block_classes = [QwenImageEditInpaintDenoiseStep, QwenImageEditDenoiseStep]

    @property
    def description(self):
        """Return the multi-line summary of this auto block."""
        lines = (
            "Denoise step that iteratively denoise the latents. \n",
            "This block supports edit (img2img) and edit inpaint tasks for QwenImage Edit. \n",
            " - `QwenImageEditInpaintDenoiseStep` (inpaint) is used when `processed_mask_image` is provided.\n",
            " - `QwenImageEditDenoiseStep` (img2img) is used when `image_latents` is provided.\n",
            " - if `processed_mask_image` or `image_latents` is not provided, step will be skipped.",
        )
        return "".join(lines)
## 2.7 QwenImage-Edit/auto blocks & presets
# Ordered (block name, block instance) pairs forming the full QwenImage-Edit auto preset:
# text encoding, VAE encoding, input handling, before-denoise, denoise, then decode.
# `QwenImageEditAutoBlocks` reads its `block_names` / `block_classes` from this mapping, and it
# is also registered under the "edit_auto" key of `ALL_BLOCKS`.
EDIT_AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", QwenImageEditVLEncoderStep()),
        ("vae_encoder", QwenImageEditAutoVaeEncoderStep()),
        ("input", QwenImageEditAutoInputStep()),
        ("before_denoise", QwenImageEditAutoBeforeDenoiseStep()),
        ("denoise", QwenImageEditAutoDenoiseStep()),
        ("decode", QwenImageAutoDecodeStep()),
    ]
)
class QwenImageEditAutoBlocks(SequentialPipelineBlocks):
    """
    Sequential block built from the entries of `EDIT_AUTO_BLOCKS`.

    The sub-block names and instances are taken directly from that mapping's keys and values.
    """

    model_name = "qwenimage-edit"
    block_names = EDIT_AUTO_BLOCKS.keys()
    block_classes = EDIT_AUTO_BLOCKS.values()

    @property
    def description(self):
        """Human-readable summary of the supported tasks and the inputs each one needs."""
        # Adjacent string literals are concatenated by the parser into one string.
        text = (
            "Auto Modular pipeline for edit (img2img) and edit inpaint tasks using QwenImage-Edit.\n"
            "- for edit (img2img) generation, you need to provide `image`\n"
            "- for edit inpainting, you need to provide `mask_image` and `image`, optionally you can provide `padding_mask_crop` \n"
        )
        return text
# 3. all block presets supported in QwenImage & QwenImage-Edit
# Registry mapping a preset name to its block collection.
# NOTE(review): every value other than `EDIT_AUTO_BLOCKS` is defined outside this view;
# presumably each is an `InsertableDict` of (name, block) pairs like `EDIT_AUTO_BLOCKS` - confirm.
ALL_BLOCKS = {
    "text2image": TEXT2IMAGE_BLOCKS,
    "img2img": IMAGE2IMAGE_BLOCKS,
    "edit": EDIT_BLOCKS,
    "edit_inpaint": EDIT_INPAINT_BLOCKS,
    "inpaint": INPAINT_BLOCKS,
    "controlnet": CONTROLNET_BLOCKS,
    "auto": AUTO_BLOCKS,
    "edit_auto": EDIT_AUTO_BLOCKS,
}
@@ -0,0 +1,202 @@
# Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...configuration_utils import ConfigMixin, register_to_config
from ...loaders import QwenImageLoraLoaderMixin
from ..modular_pipeline import ModularPipeline
class QwenImagePachifier(ConfigMixin):
    """
    A class to pack and unpack latents for QwenImage.

    Packing folds every `patch_size x patch_size` spatial patch of a latent map into the channel dimension and
    flattens the patch grid into a sequence; unpacking restores the spatial layout.

    Args:
        patch_size (`int`, defaults to 2):
            Side length of the square spatial patch that is folded into the channel dimension.
    """

    config_name = "config.json"

    @register_to_config
    def __init__(
        self,
        patch_size: int = 2,
    ):
        super().__init__()

    def pack_latents(self, latents):
        """
        Pack spatial latents into a sequence of flattened patches.

        Args:
            latents:
                Tensor of shape `(batch, channels, height, width)` or `(batch, channels, 1, height, width)`.

        Returns:
            Tensor of shape `(batch, (height // p) * (width // p), channels * p * p)` with `p = patch_size`.

        Raises:
            ValueError: If `latents` is not 4D/5D, if a 5D input has more than one latent frame, or if its height
                or width is not divisible by `patch_size`.
        """
        if latents.ndim not in (4, 5):
            raise ValueError(f"Latents must have 4 or 5 dimensions, but got {latents.ndim}")

        if latents.ndim == 4:
            latents = latents.unsqueeze(2)

        batch_size, num_channels_latents, num_latent_frames, latent_height, latent_width = latents.shape
        patch_size = self.config.patch_size

        # The patch layout below has no frame axis, so only a singleton frame dimension can be packed.
        # Fail with a clear message instead of an opaque shape error from the reshape.
        if num_latent_frames != 1:
            raise ValueError(f"Latents must have exactly 1 latent frame to be packed, but got {num_latent_frames}")

        if latent_height % patch_size != 0 or latent_width % patch_size != 0:
            raise ValueError(
                f"Latent height and width must be divisible by {patch_size}, but got {latent_height} and {latent_width}"
            )

        patches_height = latent_height // patch_size
        patches_width = latent_width // patch_size

        # `reshape` (rather than `view`) also accepts non-contiguous inputs; for contiguous inputs it is a view.
        latents = latents.reshape(
            batch_size,
            num_channels_latents,
            patches_height,
            patch_size,
            patches_width,
            patch_size,
        )
        latents = latents.permute(
            0, 2, 4, 1, 3, 5
        )  # Batch_size, num_patches_height, num_patches_width, num_channels_latents, patch_size, patch_size
        latents = latents.reshape(
            batch_size,
            patches_height * patches_width,
            num_channels_latents * patch_size * patch_size,
        )

        return latents

    def unpack_latents(self, latents, height, width, vae_scale_factor=8):
        """
        Unpack a sequence of flattened patches back into spatial latents.

        Args:
            latents:
                Tensor of shape `(batch, num_patches, channels)`.
            height:
                Image height in pixels; converted with `int(...)` before use.
            width:
                Image width in pixels; converted with `int(...)` before use.
            vae_scale_factor (`int`, defaults to 8):
                Spatial compression factor used to convert pixel sizes into latent sizes.

        Returns:
            Tensor of shape `(batch, channels // (p * p), 1, latent_height, latent_width)` with `p = patch_size`
            and `latent_height = p * (height // (vae_scale_factor * p))` (same for width).

        Raises:
            ValueError: If `latents` is not 3D, if `channels` is not divisible by `p * p`, or if `num_patches` does
                not match the patch grid derived from `height` and `width`.
        """
        if latents.ndim != 3:
            raise ValueError(f"Latents must have 3 dimensions, but got {latents.ndim}")

        batch_size, num_patches, channels = latents.shape
        patch_size = self.config.patch_size
        patch_area = patch_size * patch_size

        # VAE applies 8x compression on images but we must also account for packing which requires
        # latent height and width to be divisible by 2.
        height = patch_size * (int(height) // (vae_scale_factor * patch_size))
        width = patch_size * (int(width) // (vae_scale_factor * patch_size))
        patches_height = height // patch_size
        patches_width = width // patch_size

        # Validate up front so a size mismatch is reported in terms of the inputs rather than as a reshape error.
        if channels % patch_area != 0:
            raise ValueError(
                f"Latent channels must be divisible by {patch_area} (patch_size ** 2), but got {channels}"
            )
        if num_patches != patches_height * patches_width:
            raise ValueError(
                f"Expected {patches_height * patches_width} patches for a {height}x{width} latent grid, "
                f"but got {num_patches}"
            )

        unpacked_channels = channels // patch_area

        # `reshape` (rather than `view`) also accepts non-contiguous inputs; for contiguous inputs it is a view.
        latents = latents.reshape(
            batch_size,
            patches_height,
            patches_width,
            unpacked_channels,
            patch_size,
            patch_size,
        )
        latents = latents.permute(0, 3, 1, 4, 2, 5)

        latents = latents.reshape(batch_size, unpacked_channels, 1, height, width)

        return latents
class QwenImageModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin):
    """
    A ModularPipeline for QwenImage.

    The properties on this class report values derived from the `vae`, `transformer` and `guider` attributes when
    they are present and not `None`, and fixed fallback values otherwise.

    <Tip warning={true}>

        This is an experimental feature and is likely to change in the future.

    </Tip>
    """

    @property
    def default_height(self):
        """Default height: `default_sample_size` multiplied by `vae_scale_factor`."""
        sample_size = self.default_sample_size
        return sample_size * self.vae_scale_factor

    @property
    def default_width(self):
        """Default width: `default_sample_size` multiplied by `vae_scale_factor`."""
        sample_size = self.default_sample_size
        return sample_size * self.vae_scale_factor

    @property
    def default_sample_size(self):
        """Base size that is multiplied by `vae_scale_factor` to get the default height and width."""
        return 128

    @property
    def vae_scale_factor(self):
        """`2 ** len(vae.temperal_downsample)` when a VAE is available, otherwise 8."""
        if getattr(self, "vae", None) is None:
            return 8
        # `temperal_downsample` is the attribute name exactly as this code reads it from the VAE.
        return 2 ** len(self.vae.temperal_downsample)

    @property
    def num_channels_latents(self):
        """`transformer.config.in_channels // 4` when a transformer is available, otherwise 16."""
        if getattr(self, "transformer", None) is None:
            return 16
        return self.transformer.config.in_channels // 4

    @property
    def is_guidance_distilled(self):
        """`transformer.config.guidance_embeds` when a transformer is available, otherwise `False`."""
        if getattr(self, "transformer", None) is None:
            return False
        return self.transformer.config.guidance_embeds

    @property
    def requires_unconditional_embeds(self):
        """Whether the guider is enabled and uses more than one condition; `False` without a guider."""
        if getattr(self, "guider", None) is None:
            return False
        return self.guider._enabled and self.guider.num_conditions > 1
class QwenImageEditModularPipeline(ModularPipeline, QwenImageLoraLoaderMixin):
    """
    A ModularPipeline for QwenImage-Edit.

    The properties on this class report values derived from the `vae`, `transformer` and `guider` attributes when
    they are present and not `None`, and fixed fallback values otherwise.

    <Tip warning={true}>

        This is an experimental feature and is likely to change in the future.

    </Tip>
    """

    # YiYi TODO: qwen edit should not provide default height/width, should be derived from the resized input image (after adjustment) produced by the resize step.
    @property
    def default_height(self):
        """Default height: `default_sample_size` multiplied by `vae_scale_factor`."""
        sample_size = self.default_sample_size
        return sample_size * self.vae_scale_factor

    @property
    def default_width(self):
        """Default width: `default_sample_size` multiplied by `vae_scale_factor`."""
        sample_size = self.default_sample_size
        return sample_size * self.vae_scale_factor

    @property
    def default_sample_size(self):
        """Base size that is multiplied by `vae_scale_factor` to get the default height and width."""
        return 128

    @property
    def vae_scale_factor(self):
        """`2 ** len(vae.temperal_downsample)` when a VAE is available, otherwise 8."""
        if getattr(self, "vae", None) is None:
            return 8
        # `temperal_downsample` is the attribute name exactly as this code reads it from the VAE.
        return 2 ** len(self.vae.temperal_downsample)

    @property
    def num_channels_latents(self):
        """`transformer.config.in_channels // 4` when a transformer is available, otherwise 16."""
        if getattr(self, "transformer", None) is None:
            return 16
        return self.transformer.config.in_channels // 4

    @property
    def is_guidance_distilled(self):
        """`transformer.config.guidance_embeds` when a transformer is available, otherwise `False`."""
        if getattr(self, "transformer", None) is None:
            return False
        return self.transformer.config.guidance_embeds

    @property
    def requires_unconditional_embeds(self):
        """Whether the guider is enabled and uses more than one condition; `False` without a guider."""
        if getattr(self, "guider", None) is None:
            return False
        return self.guider._enabled and self.guider.num_conditions > 1
@@ -22,7 +22,7 @@ from ...configuration_utils import FrozenDict
from ...guiders import ClassifierFreeGuidance
from ...image_processor import VaeImageProcessor
from ...models import AutoencoderKL, ControlNetModel, ControlNetUnionModel, UNet2DConditionModel
from ...pipelines.controlnet.multicontrolnet import MultiControlNetModel
from ...models.controlnets.multicontrolnet import MultiControlNetModel
from ...schedulers import EulerDiscreteScheduler
from ...utils import logging
from ...utils.torch_utils import randn_tensor, unwrap_module
@@ -695,7 +695,7 @@ class StableDiffusionXLDenoiseStep(StableDiffusionXLDenoiseLoopWrapper):
return (
"Denoise step that iteratively denoise the latents. \n"
"Its loop logic is defined in `StableDiffusionXLDenoiseLoopWrapper.__call__` method \n"
"At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
"At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
" - `StableDiffusionXLLoopBeforeDenoiser`\n"
" - `StableDiffusionXLLoopDenoiser`\n"
" - `StableDiffusionXLLoopAfterDenoiser`\n"
@@ -717,7 +717,7 @@ class StableDiffusionXLControlNetDenoiseStep(StableDiffusionXLDenoiseLoopWrapper
return (
"Denoise step that iteratively denoise the latents with controlnet. \n"
"Its loop logic is defined in `StableDiffusionXLDenoiseLoopWrapper.__call__` method \n"
"At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
"At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
" - `StableDiffusionXLLoopBeforeDenoiser`\n"
" - `StableDiffusionXLControlNetLoopDenoiser`\n"
" - `StableDiffusionXLLoopAfterDenoiser`\n"
@@ -739,7 +739,7 @@ class StableDiffusionXLInpaintDenoiseStep(StableDiffusionXLDenoiseLoopWrapper):
return (
"Denoise step that iteratively denoise the latents(for inpainting task only). \n"
"Its loop logic is defined in `StableDiffusionXLDenoiseLoopWrapper.__call__` method \n"
"At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
"At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
" - `StableDiffusionXLInpaintLoopBeforeDenoiser`\n"
" - `StableDiffusionXLLoopDenoiser`\n"
" - `StableDiffusionXLInpaintLoopAfterDenoiser`\n"
@@ -761,7 +761,7 @@ class StableDiffusionXLInpaintControlNetDenoiseStep(StableDiffusionXLDenoiseLoop
return (
"Denoise step that iteratively denoise the latents(for inpainting task only) with controlnet. \n"
"Its loop logic is defined in `StableDiffusionXLDenoiseLoopWrapper.__call__` method \n"
"At each iteration, it runs blocks defined in `sub_blocks` sequencially:\n"
"At each iteration, it runs blocks defined in `sub_blocks` sequentially:\n"
" - `StableDiffusionXLInpaintLoopBeforeDenoiser`\n"
" - `StableDiffusionXLControlNetLoopDenoiser`\n"
" - `StableDiffusionXLInpaintLoopAfterDenoiser`\n"
@@ -76,6 +76,7 @@ class StableDiffusionXLModularPipeline(
vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
return vae_scale_factor
# YiYi TODO: change to num_channels_latents
@property
def num_channels_unet(self):
num_channels_unet = 4

Some files were not shown because too many files have changed in this diff Show More