Compare commits
33 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 9862c93d6d | |||
| 71ae84da4a | |||
| 53f1043cbb | |||
| 1fa5639438 | |||
| ed4efbd63d | |||
| 9c29e938d7 | |||
| 071807c853 | |||
| ee1516e5c7 | |||
| ec9323996b | |||
| fc5e906689 | |||
| 8520d496f0 | |||
| a674914fd5 | |||
| ec3d58286d | |||
| ed6cf52572 | |||
| e23705e557 | |||
| b848d479b1 | |||
| d0c02398b9 | |||
| 5dcdf4ac9a | |||
| 86294d3c7f | |||
| d70f8ee18b | |||
| 06beecafc5 | |||
| daf0a23958 | |||
| 38ced7ee59 | |||
| 23c98025b3 | |||
| 8cd7426e56 | |||
| fbce7aeb32 | |||
| 35fada4169 | |||
| fbe2fe5578 | |||
| c86511586f | |||
| 60892c55a4 | |||
| 8fe5a14d9b | |||
| 58431f102c | |||
| 4a9ab650aa |
@@ -11,6 +11,7 @@ on:
|
||||
- "tests/**.py"
|
||||
- ".github/**.yml"
|
||||
- "utils/**.py"
|
||||
- "setup.py"
|
||||
push:
|
||||
branches:
|
||||
- ci-*
|
||||
|
||||
+16
-17
@@ -17,12 +17,8 @@
|
||||
title: AutoPipeline
|
||||
- local: tutorials/basic_training
|
||||
title: Train a diffusion model
|
||||
- local: tutorials/using_peft_for_inference
|
||||
title: Load LoRAs for inference
|
||||
- local: tutorials/fast_diffusion
|
||||
title: Accelerate inference of text-to-image diffusion models
|
||||
- local: tutorials/inference_with_big_models
|
||||
title: Working with big models
|
||||
title: Tutorials
|
||||
- sections:
|
||||
- local: using-diffusers/loading
|
||||
@@ -33,11 +29,24 @@
|
||||
title: Load schedulers and models
|
||||
- local: using-diffusers/other-formats
|
||||
title: Model files and layouts
|
||||
- local: using-diffusers/loading_adapters
|
||||
title: Load adapters
|
||||
- local: using-diffusers/push_to_hub
|
||||
title: Push files to the Hub
|
||||
title: Load pipelines and adapters
|
||||
- sections:
|
||||
- local: tutorials/using_peft_for_inference
|
||||
title: LoRA
|
||||
- local: using-diffusers/ip_adapter
|
||||
title: IP-Adapter
|
||||
- local: using-diffusers/controlnet
|
||||
title: ControlNet
|
||||
- local: using-diffusers/t2i_adapter
|
||||
title: T2I-Adapter
|
||||
- local: using-diffusers/dreambooth
|
||||
title: DreamBooth
|
||||
- local: using-diffusers/textual_inversion_inference
|
||||
title: Textual inversion
|
||||
title: Adapters
|
||||
isExpanded: false
|
||||
- sections:
|
||||
- local: using-diffusers/unconditional_image_generation
|
||||
title: Unconditional image generation
|
||||
@@ -59,8 +68,6 @@
|
||||
title: Create a server
|
||||
- local: training/distributed_inference
|
||||
title: Distributed inference
|
||||
- local: using-diffusers/merge_loras
|
||||
title: Merge LoRAs
|
||||
- local: using-diffusers/scheduler_features
|
||||
title: Scheduler features
|
||||
- local: using-diffusers/callback
|
||||
@@ -97,20 +104,12 @@
|
||||
title: SDXL Turbo
|
||||
- local: using-diffusers/kandinsky
|
||||
title: Kandinsky
|
||||
- local: using-diffusers/ip_adapter
|
||||
title: IP-Adapter
|
||||
- local: using-diffusers/omnigen
|
||||
title: OmniGen
|
||||
- local: using-diffusers/pag
|
||||
title: PAG
|
||||
- local: using-diffusers/controlnet
|
||||
title: ControlNet
|
||||
- local: using-diffusers/t2i_adapter
|
||||
title: T2I-Adapter
|
||||
- local: using-diffusers/inference_with_lcm
|
||||
title: Latent Consistency Model
|
||||
- local: using-diffusers/textual_inversion_inference
|
||||
title: Textual inversion
|
||||
- local: using-diffusers/shap-e
|
||||
title: Shap-E
|
||||
- local: using-diffusers/diffedit
|
||||
@@ -180,7 +179,7 @@
|
||||
title: Quantization Methods
|
||||
- sections:
|
||||
- local: optimization/fp16
|
||||
title: Speed up inference
|
||||
title: Accelerate inference
|
||||
- local: optimization/memory
|
||||
title: Reduce memory usage
|
||||
- local: optimization/torch2.0
|
||||
|
||||
@@ -966,7 +966,7 @@ pipe.to("cuda")
|
||||
prompt = {
|
||||
0: "A caterpillar on a leaf, high quality, photorealistic",
|
||||
40: "A caterpillar transforming into a cocoon, on a leaf, near flowers, photorealistic",
|
||||
80: "A cocoon on a leaf, flowers in the backgrond, photorealistic",
|
||||
80: "A cocoon on a leaf, flowers in the background, photorealistic",
|
||||
120: "A cocoon maturing and a butterfly being born, flowers and leaves visible in the background, photorealistic",
|
||||
160: "A beautiful butterfly, vibrant colors, sitting on a leaf, flowers in the background, photorealistic",
|
||||
200: "A beautiful butterfly, flying away in a forest, photorealistic",
|
||||
|
||||
@@ -29,7 +29,7 @@ You can find additional information about LEDITS++ on the [project page](https:/
|
||||
</Tip>
|
||||
|
||||
<Tip warning={true}>
|
||||
Due to some backward compatability issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion.
|
||||
Due to some backward compatibility issues with the current diffusers implementation of [`~schedulers.DPMSolverMultistepScheduler`] this implementation of LEdits++ can no longer guarantee perfect inversion.
|
||||
This issue is unlikely to have any noticeable effects on applied use-cases. However, we provide an alternative implementation that guarantees perfect inversion in a dedicated [GitHub repo](https://github.com/ml-research/ledits_pp).
|
||||
</Tip>
|
||||
|
||||
|
||||
@@ -285,7 +285,7 @@ pipe = WanImageToVideoPipeline.from_pretrained(
|
||||
image_encoder=image_encoder,
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
# Since we've offloaded the larger models alrady, we can move the rest of the model components to GPU
|
||||
# Since we've offloaded the larger models already, we can move the rest of the model components to GPU
|
||||
pipe.to("cuda")
|
||||
|
||||
image = load_image(
|
||||
@@ -368,7 +368,7 @@ pipe = WanImageToVideoPipeline.from_pretrained(
|
||||
image_encoder=image_encoder,
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
# Since we've offloaded the larger models alrady, we can move the rest of the model components to GPU
|
||||
# Since we've offloaded the larger models already, we can move the rest of the model components to GPU
|
||||
pipe.to("cuda")
|
||||
|
||||
image = load_image(
|
||||
|
||||
@@ -10,120 +10,211 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Speed up inference
|
||||
# Accelerate inference
|
||||
|
||||
There are several ways to optimize Diffusers for inference speed, such as reducing the computational burden by lowering the data precision or using a lightweight distilled model. There are also memory-efficient attention implementations, [xFormers](xformers) and [scaled dot product attention](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) in PyTorch 2.0, that reduce memory usage which also indirectly speeds up inference. Different speed optimizations can be stacked together to get the fastest inference times.
|
||||
Diffusion models are slow at inference because generation is an iterative process where noise is gradually refined into an image or video over a certain number of "steps". To speedup this process, you can try experimenting with different [schedulers](../api/schedulers/overview), reduce the precision of the model weights for faster computations, use more memory-efficient attention mechanisms, and more.
|
||||
|
||||
> [!TIP]
|
||||
> Optimizing for inference speed or reduced memory usage can lead to improved performance in the other category, so you should try to optimize for both whenever you can. This guide focuses on inference speed, but you can learn more about lowering memory usage in the [Reduce memory usage](memory) guide.
|
||||
Combine and use these techniques together to make inference faster than using any single technique on its own.
|
||||
|
||||
The inference times below are obtained from generating a single 512x512 image from the prompt "a photo of an astronaut riding a horse on mars" with 50 DDIM steps on a NVIDIA A100.
|
||||
This guide will go over how to accelerate inference.
|
||||
|
||||
| setup | latency | speed-up |
|
||||
|----------|---------|----------|
|
||||
| baseline | 5.27s | x1 |
|
||||
| tf32 | 4.14s | x1.27 |
|
||||
| fp16 | 3.51s | x1.50 |
|
||||
| combined | 3.41s | x1.54 |
|
||||
## Model data type
|
||||
|
||||
## TensorFloat-32
|
||||
The precision and data type of the model weights affect inference speed because a higher precision requires more memory to load and more time to perform the computations. PyTorch loads model weights in float32 or full precision by default, so changing the data type is a simple way to quickly get faster inference.
|
||||
|
||||
On Ampere and later CUDA devices, matrix multiplications and convolutions can use the [TensorFloat-32 (tf32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode for faster, but slightly less accurate computations. By default, PyTorch enables tf32 mode for convolutions but not matrix multiplications. Unless your network requires full float32 precision, we recommend enabling tf32 for matrix multiplications. It can significantly speed up computations with typically negligible loss in numerical accuracy.
|
||||
<hfoptions id="dtypes">
|
||||
<hfoption id="bfloat16">
|
||||
|
||||
```python
|
||||
bfloat16 is similar to float16 but it is more robust to numerical errors. Hardware support for bfloat16 varies, but most modern GPUs are capable of supporting bfloat16.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
|
||||
).to("cuda")
|
||||
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
pipeline(prompt, num_inference_steps=30).images[0]
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="float16">
|
||||
|
||||
float16 is similar to bfloat16 but may be more prone to numerical errors.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
pipeline(prompt, num_inference_steps=30).images[0]
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="TensorFloat-32">
|
||||
|
||||
[TensorFloat-32 (tf32)](https://blogs.nvidia.com/blog/2020/05/14/tensorfloat-32-precision-format/) mode is supported on NVIDIA Ampere GPUs and it computes the convolution and matrix multiplication operations in tf32. Storage and other operations are kept in float32. This enables significantly faster computations when combined with bfloat16 or float16.
|
||||
|
||||
PyTorch only enables tf32 mode for convolutions by default and you'll need to explicitly enable it for matrix multiplications.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
torch.backends.cuda.matmul.allow_tf32 = True
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
|
||||
).to("cuda")
|
||||
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
pipeline(prompt, num_inference_steps=30).images[0]
|
||||
```
|
||||
|
||||
Learn more about tf32 in the [Mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#tf32) guide.
|
||||
Refer to the [mixed precision training](https://huggingface.co/docs/transformers/en/perf_train_gpu_one#mixed-precision) docs for more details.
|
||||
|
||||
## Half-precision weights
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
To save GPU memory and get more speed, set `torch_dtype=torch.float16` to load and run the model weights directly with half-precision weights.
|
||||
|
||||
```Python
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
```
|
||||
|
||||
> [!WARNING]
|
||||
> Don't use [torch.autocast](https://pytorch.org/docs/stable/amp.html#torch.autocast) in any of the pipelines as it can lead to black images and is always slower than pure float16 precision.
|
||||
|
||||
## Distilled model
|
||||
|
||||
You could also use a distilled Stable Diffusion model and autoencoder to speed up inference. During distillation, many of the UNet's residual and attention blocks are shed to reduce the model size by 51% and improve latency on CPU/GPU by 43%. The distilled model is faster and uses less memory while generating images of comparable quality to the full Stable Diffusion model.
|
||||
## Scaled dot product attention
|
||||
|
||||
> [!TIP]
|
||||
> Read the [Open-sourcing Knowledge Distillation Code and Weights of SD-Small and SD-Tiny](https://huggingface.co/blog/sd_distillation) blog post to learn more about how knowledge distillation training works to produce a faster, smaller, and cheaper generative model.
|
||||
> Memory-efficient attention optimizes for inference speed *and* [memory usage](./memory#memory-efficient-attention)!
|
||||
|
||||
The inference times below are obtained from generating 4 images from the prompt "a photo of an astronaut riding a horse on mars" with 25 PNDM steps on a NVIDIA A100. Each generation is repeated 3 times with the distilled Stable Diffusion v1.4 model by [Nota AI](https://hf.co/nota-ai).
|
||||
[Scaled dot product attention (SDPA)](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) implements several attention backends, [FlashAttention](https://github.com/Dao-AILab/flash-attention), [xFormers](https://github.com/facebookresearch/xformers), and a native C++ implementation. It automatically selects the most optimal backend for your hardware.
|
||||
|
||||
| setup | latency | speed-up |
|
||||
|------------------------------|---------|----------|
|
||||
| baseline | 6.37s | x1 |
|
||||
| distilled | 4.18s | x1.52 |
|
||||
| distilled + tiny autoencoder | 3.83s | x1.66 |
|
||||
|
||||
Let's load the distilled Stable Diffusion model and compare it against the original Stable Diffusion model.
|
||||
SDPA is enabled by default if you're using PyTorch >= 2.0 and no additional changes are required to your code. You could try experimenting with other attention backends though if you'd like to choose your own. The example below uses the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to enable efficient attention.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPipeline
|
||||
from torch.nn.attention import SDPBackend, sdpa_kernel
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
distilled = StableDiffusionPipeline.from_pretrained(
|
||||
"nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
|
||||
).to("cuda")
|
||||
prompt = "a golden vase with different flowers"
|
||||
generator = torch.manual_seed(2023)
|
||||
image = distilled("a golden vase with different flowers", num_inference_steps=25, generator=generator).images[0]
|
||||
image
|
||||
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
|
||||
with sdpa_kernel(SDPBackend.EFFICIENT_ATTENTION):
|
||||
image = pipeline(prompt, num_inference_steps=30).images[0]
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/original_sd.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original Stable Diffusion</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
## torch.compile
|
||||
|
||||
### Tiny AutoEncoder
|
||||
[torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) accelerates inference by compiling PyTorch code and operations into optimized kernels. Diffusers typically compiles the more compute-intensive models like the UNet, transformer, or VAE.
|
||||
|
||||
To speed inference up even more, replace the autoencoder with a [distilled version](https://huggingface.co/sayakpaul/taesdxl-diffusers) of it.
|
||||
Enable the following compiler settings for maximum speed (refer to the [full list](https://github.com/pytorch/pytorch/blob/main/torch/_inductor/config.py) for more options).
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoencoderTiny, StableDiffusionPipeline
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
distilled = StableDiffusionPipeline.from_pretrained(
|
||||
"nota-ai/bk-sdm-small", torch_dtype=torch.float16, use_safetensors=True,
|
||||
).to("cuda")
|
||||
distilled.vae = AutoencoderTiny.from_pretrained(
|
||||
"sayakpaul/taesd-diffusers", torch_dtype=torch.float16, use_safetensors=True,
|
||||
).to("cuda")
|
||||
|
||||
prompt = "a golden vase with different flowers"
|
||||
generator = torch.manual_seed(2023)
|
||||
image = distilled("a golden vase with different flowers", num_inference_steps=25, generator=generator).images[0]
|
||||
image
|
||||
torch._inductor.config.conv_1x1_as_mm = True
|
||||
torch._inductor.config.coordinate_descent_tuning = True
|
||||
torch._inductor.config.epilogue_fusion = False
|
||||
torch._inductor.config.coordinate_descent_check_all_directions = True
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/distilled_sd_vae.png" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">distilled Stable Diffusion + Tiny AutoEncoder</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
Load and compile the UNet and VAE. There are several different modes you can choose from, but `"max-autotune"` optimizes for the fastest speed by compiling to a CUDA graph. CUDA graphs effectively reduces the overhead by launching multiple GPU operations through a single CPU operation.
|
||||
|
||||
More tiny autoencoder models for other Stable Diffusion models, like Stable Diffusion 3, are available from [madebyollin](https://huggingface.co/madebyollin).
|
||||
> [!TIP]
|
||||
> With PyTorch 2.3.1, you can control the caching behavior of torch.compile. This is particularly beneficial for compilation modes like `"max-autotune"` which performs a grid-search over several compilation flags to find the optimal configuration. Learn more in the [Compile Time Caching in torch.compile](https://pytorch.org/tutorials/recipes/torch_compile_caching_tutorial.html) tutorial.
|
||||
|
||||
Changing the memory layout to [channels_last](./memory#torchchannels_last) also optimizes memory and inference speed.
|
||||
|
||||
```py
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.unet.to(memory_format=torch.channels_last)
|
||||
pipeline.vae.to(memory_format=torch.channels_last)
|
||||
pipeline.unet = torch.compile(
|
||||
pipeline.unet, mode="max-autotune", fullgraph=True
|
||||
)
|
||||
pipeline.vae.decode = torch.compile(
|
||||
pipeline.vae.decode,
|
||||
mode="max-autotune",
|
||||
fullgraph=True
|
||||
)
|
||||
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
pipeline(prompt, num_inference_steps=30).images[0]
|
||||
```
|
||||
|
||||
Compilation is slow the first time, but once compiled, it is significantly faster. Try to only use the compiled pipeline on the same type of inference operations. Calling the compiled pipeline on a different image size retriggers compilation which is slow and inefficient.
|
||||
|
||||
### Graph breaks
|
||||
|
||||
It is important to specify `fullgraph=True` in torch.compile to ensure there are no graph breaks in the underlying model. This allows you to take advantage of torch.compile without any performance degradation. For the UNet and VAE, this changes how you access the return variables.
|
||||
|
||||
```diff
|
||||
- latents = unet(
|
||||
- latents, timestep=timestep, encoder_hidden_states=prompt_embeds
|
||||
-).sample
|
||||
|
||||
+ latents = unet(
|
||||
+ latents, timestep=timestep, encoder_hidden_states=prompt_embeds, return_dict=False
|
||||
+)[0]
|
||||
```
|
||||
|
||||
### GPU sync
|
||||
|
||||
The `step()` function is [called](https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py#L1228) on the scheduler each time after the denoiser makes a prediction, and the `sigmas` variable is [indexed](https://github.com/huggingface/diffusers/blob/1d686bac8146037e97f3fd8c56e4063230f71751/src/diffusers/schedulers/scheduling_euler_discrete.py#L476). When placed on the GPU, it introduces latency because of the communication sync between the CPU and GPU. It becomes more evident when the denoiser has already been compiled.
|
||||
|
||||
In general, the `sigmas` should [stay on the CPU](https://github.com/huggingface/diffusers/blob/35a969d297cba69110d175ee79c59312b9f49e1e/src/diffusers/schedulers/scheduling_euler_discrete.py#L240) to avoid the communication sync and latency.
|
||||
|
||||
## Dynamic quantization
|
||||
|
||||
[Dynamic quantization](https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html) improves inference speed by reducing precision to enable faster math operations. This particular type of quantization determines how to scale the activations based on the data at runtime rather than using a fixed scaling factor. As a result, the scaling factor is more accurately aligned with the data.
|
||||
|
||||
The example below applies [dynamic int8 quantization](https://pytorch.org/tutorials/recipes/recipes/dynamic_quantization.html) to the UNet and VAE with the [torchao](../quantization/torchao) library.
|
||||
|
||||
> [!TIP]
|
||||
> Refer to our [torchao](../quantization/torchao) docs to learn more about how to use the Diffusers torchao integration.
|
||||
|
||||
Configure the compiler tags for maximum speed.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from torchao import apply_dynamic_quant
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
torch._inductor.config.conv_1x1_as_mm = True
|
||||
torch._inductor.config.coordinate_descent_tuning = True
|
||||
torch._inductor.config.epilogue_fusion = False
|
||||
torch._inductor.config.coordinate_descent_check_all_directions = True
|
||||
torch._inductor.config.force_fuse_int_mm_with_mul = True
|
||||
torch._inductor.config.use_mixed_mm = True
|
||||
```
|
||||
|
||||
Filter out some linear layers in the UNet and VAE which don't benefit from dynamic quantization with the [dynamic_quant_filter_fn](https://github.com/huggingface/diffusion-fast/blob/0f169640b1db106fe6a479f78c1ed3bfaeba3386/utils/pipeline_utils.py#L16).
|
||||
|
||||
```py
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.bfloat16
|
||||
).to("cuda")
|
||||
|
||||
apply_dynamic_quant(pipeline.unet, dynamic_quant_filter_fn)
|
||||
apply_dynamic_quant(pipeline.vae, dynamic_quant_filter_fn)
|
||||
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
pipeline(prompt, num_inference_steps=30).images[0]
|
||||
```
|
||||
|
||||
## Fused projection matrices
|
||||
|
||||
> [!WARNING]
|
||||
> The [fuse_qkv_projections](https://github.com/huggingface/diffusers/blob/58431f102cf39c3c8a569f32d71b2ea8caa461e1/src/diffusers/pipelines/pipeline_utils.py#L2034) method is experimental and support is limited to mostly Stable Diffusion pipelines. Take a look at this [PR](https://github.com/huggingface/diffusers/pull/6179) to learn more about how to enable it for other pipelines
|
||||
|
||||
An input is projected into three subspaces, represented by the projection matrices Q, K, and V, in an attention block. These projections are typically calculated separately, but you can horizontally combine these into a single matrix and perform the projection in a single step. It increases the size of the matrix multiplications of the input projections and also improves the impact of quantization.
|
||||
|
||||
```py
|
||||
pipeline.fuse_qkv_projections()
|
||||
```
|
||||
@@ -12,178 +12,258 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Reduce memory usage
|
||||
|
||||
A barrier to using diffusion models is the large amount of memory required. To overcome this challenge, there are several memory-reducing techniques you can use to run even some of the largest models on free-tier or consumer GPUs. Some of these techniques can even be combined to further reduce memory usage.
|
||||
Modern diffusion models like [Flux](../api/pipelines/flux) and [Wan](../api/pipelines/wan) have billions of parameters that take up a lot of memory on your hardware for inference. This is challenging because common GPUs often don't have sufficient memory. To overcome the memory limitations, you can use more than one GPU (if available), offload some of the pipeline components to the CPU, and more.
|
||||
|
||||
<Tip>
|
||||
This guide will show you how to reduce your memory usage.
|
||||
|
||||
In many cases, optimizing for memory or speed leads to improved performance in the other, so you should try to optimize for both whenever you can. This guide focuses on minimizing memory usage, but you can also learn more about how to [Speed up inference](fp16).
|
||||
> [!TIP]
|
||||
> Keep in mind these techniques may need to be adjusted depending on the model! For example, a transformer-based diffusion model may not benefit equally from these inference speed optimizations as a UNet-based model.
|
||||
|
||||
</Tip>
|
||||
## Multiple GPUs
|
||||
|
||||
The results below are obtained from generating a single 512x512 image from the prompt a photo of an astronaut riding a horse on mars with 50 DDIM steps on a Nvidia Titan RTX, demonstrating the speed-up you can expect as a result of reduced memory consumption.
|
||||
If you have access to more than one GPU, there a few options for efficiently loading and distributing a large model across your hardware. These features are supported by the [Accelerate](https://huggingface.co/docs/accelerate/index) library, so make sure it is installed first.
|
||||
|
||||
| | latency | speed-up |
|
||||
| ---------------- | ------- | ------- |
|
||||
| original | 9.50s | x1 |
|
||||
| fp16 | 3.61s | x2.63 |
|
||||
| channels last | 3.30s | x2.88 |
|
||||
| traced UNet | 3.21s | x2.96 |
|
||||
| memory-efficient attention | 2.63s | x3.61 |
|
||||
|
||||
## Sliced VAE
|
||||
|
||||
Sliced VAE enables decoding large batches of images with limited VRAM or batches with 32 images or more by decoding the batches of latents one image at a time. You'll likely want to couple this with [`~ModelMixin.enable_xformers_memory_efficient_attention`] to reduce memory use further if you have xFormers installed.
|
||||
|
||||
To use sliced VAE, call [`~StableDiffusionPipeline.enable_vae_slicing`] on your pipeline before inference:
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
)
|
||||
pipe = pipe.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
pipe.enable_vae_slicing()
|
||||
#pipe.enable_xformers_memory_efficient_attention()
|
||||
images = pipe([prompt] * 32).images
|
||||
```bash
|
||||
pip install -U accelerate
|
||||
```
|
||||
|
||||
You may see a small performance boost in VAE decoding on multi-image batches, and there should be no performance impact on single-image batches.
|
||||
### Sharded checkpoints
|
||||
|
||||
## Tiled VAE
|
||||
Loading large checkpoints in several shards in useful because the shards are loaded one at a time. This keeps memory usage low, only requiring enough memory for the model size and the largest shard size. We recommend sharding when the fp32 checkpoint is greater than 5GB. The default shard size is 5GB.
|
||||
|
||||
Tiled VAE processing also enables working with large images on limited VRAM (for example, generating 4k images on 8GB of VRAM) by splitting the image into overlapping tiles, decoding the tiles, and then blending the outputs together to compose the final image. You should also used tiled VAE with [`~ModelMixin.enable_xformers_memory_efficient_attention`] to reduce memory use further if you have xFormers installed.
|
||||
Shard a checkpoint in [`~DiffusionPipeline.save_pretrained`] with the `max_shard_size` parameter.
|
||||
|
||||
To use tiled VAE processing, call [`~StableDiffusionPipeline.enable_vae_tiling`] on your pipeline before inference:
|
||||
```py
|
||||
from diffusers import AutoModel
|
||||
|
||||
```python
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline, UniPCMultistepScheduler
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
unet = AutoModel.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet"
|
||||
)
|
||||
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
pipe = pipe.to("cuda")
|
||||
prompt = "a beautiful landscape photograph"
|
||||
pipe.enable_vae_tiling()
|
||||
#pipe.enable_xformers_memory_efficient_attention()
|
||||
|
||||
image = pipe([prompt], width=3840, height=2224, num_inference_steps=20).images[0]
|
||||
unet.save_pretrained("sdxl-unet-sharded", max_shard_size="5GB")
|
||||
```
|
||||
|
||||
The output image has some tile-to-tile tone variation because the tiles are decoded separately, but you shouldn't see any sharp and obvious seams between the tiles. Tiling is turned off for images that are 512x512 or smaller.
|
||||
Now you can use the sharded checkpoint, instead of the regular checkpoint, to save memory.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoModel, StableDiffusionXLPipeline
|
||||
|
||||
unet = AutoModel.from_pretrained(
|
||||
"username/sdxl-unet-sharded", torch_dtype=torch.float16
|
||||
)
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
unet=unet,
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
### Device placement
|
||||
|
||||
> [!WARNING]
|
||||
> Device placement is an experimental feature and the API may change. Only the `balanced` strategy is supported at the moment. We plan to support additional mapping strategies in the future.
|
||||
|
||||
The `device_map` parameter controls how the model components in a pipeline are distributed across devices. The `balanced` device placement strategy evenly splits the pipeline across all available devices.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoModel, StableDiffusionXLPipeline
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
device_map="balanced"
|
||||
)
|
||||
```
|
||||
|
||||
You can inspect a pipeline's device map with `hf_device_map`.
|
||||
|
||||
```py
|
||||
print(pipeline.hf_device_map)
|
||||
{'unet': 1, 'vae': 1, 'safety_checker': 0, 'text_encoder': 0}
|
||||
```
|
||||
|
||||
The `device_map` parameter also works on the model-level. This is useful for loading large models, such as the Flux diffusion transformer which has 12.5B parameters. Instead of `balanced`, set it to `"auto"` to automatically distribute a model across the fastest device first before moving to slower devices. Refer to the [Model sharding](../training/distributed_inference#model-sharding) docs for more details.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoModel
|
||||
|
||||
transformer = AutoModel.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev",
|
||||
subfolder="transformer",
|
||||
device_map="auto",
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
```
|
||||
|
||||
For more fine-grained control, pass a dictionary to enforce the maximum GPU memory to use on each device. If a device is not in `max_memory`, it is ignored and pipeline components won't be distributed to it.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoModel, StableDiffusionXLPipeline
|
||||
|
||||
max_memory = {0:"1GB", 1:"1GB"}
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
device_map="balanced",
|
||||
max_memory=max_memory
|
||||
)
|
||||
```
|
||||
|
||||
Diffusers uses the maxmium memory of all devices by default, but if they don't fit on the GPUs, then you'll need to use a single GPU and offload to the CPU with the methods below.
|
||||
|
||||
- [`~DiffusionPipeline.enable_model_cpu_offload`] only works on a single GPU but a very large model may not fit on it
|
||||
- [`~DiffusionPipeline.enable_sequential_cpu_offload`] may work but it is extremely slow and also limited to a single GPU
|
||||
|
||||
Use the [`~DiffusionPipeline.reset_device_map`] method to reset the `device_map`. This is necessary if you want to use methods like `.to()`, [`~DiffusionPipeline.enable_sequential_cpu_offload`], and [`~DiffusionPipeline.enable_model_cpu_offload`] on a pipeline that was device-mapped.
|
||||
|
||||
```py
|
||||
pipeline.reset_device_map()
|
||||
```
|
||||
|
||||
## VAE slicing
|
||||
|
||||
VAE slicing saves memory by splitting large batches of inputs into a single batch of data and separately processing them. This method works best when generating more than one image at a time.
|
||||
|
||||
For example, if you're generating 4 images at once, decoding would increase peak activation memory by 4x. VAE slicing reduces this by only decoding 1 image at a time instead of all 4 images at once.
|
||||
|
||||
Call [`~StableDiffusionPipeline.enable_vae_slicing`] to enable sliced VAE. You can expect a small increase in performance when decoding multi-image batches and no performance impact for single-image batches.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoModel, StableDiffusionXLPipeline
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
pipeline.enable_vae_slicing()
|
||||
pipeline(["An astronaut riding a horse on Mars"]*32).images[0]
|
||||
print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
|
||||
```
|
||||
|
||||
> [!WARNING]
|
||||
> [`AutoencoderKLWan`] and [`AsymmetricAutoencoderKL`] don't support slicing.
|
||||
|
||||
## VAE tiling
|
||||
|
||||
VAE tiling saves memory by dividing an image into smaller overlapping tiles instead of processing the entire image at once. This also reduces peak memory usage because the GPU is only processing a tile at a time.
|
||||
|
||||
Call [`~StableDiffusionPipeline.enable_vae_tiling`] to enable VAE tiling. The generated image may have some tone variation from tile-to-tile because they're decoded separately, but there shouldn't be any obvious seams between the tiles. Tiling is disabled for resolutions lower than a pre-specified (but configurable) limit. For example, this limit is 512x512 for the VAE in [`StableDiffusionPipeline`].
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForImage2Image
|
||||
from diffusers.utils import load_image
|
||||
|
||||
pipeline = AutoPipelineForImage2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.enable_vae_tiling()
|
||||
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-sdxl-init.png")
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
pipeline(prompt, image=init_image, strength=0.5).images[0]
|
||||
print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
|
||||
```
|
||||
|
||||
> [!WARNING]
|
||||
> [`AutoencoderKLWan`] and [`AsymmetricAutoencoderKL`] don't support tiling.
|
||||
|
||||
## CPU offloading
|
||||
|
||||
Offloading the weights to the CPU and only loading them on the GPU when performing the forward pass can also save memory. Often, this technique can reduce memory consumption to less than 3GB.
|
||||
CPU offloading selectively moves weights from the GPU to the CPU. When a component is required, it is transferred to the GPU and when it isn't required, it is moved to the CPU. This method works on submodules rather than whole models. It saves memory by avoiding storing the entire model on the GPU.
|
||||
|
||||
To perform CPU offloading, call [`~StableDiffusionPipeline.enable_sequential_cpu_offload`]:
|
||||
CPU offloading dramatically reduces memory usage, but it is also **extremely slow** because submodules are passed back and forth multiple times between devices. It can often be impractical due to how slow it is.
|
||||
|
||||
```Python
|
||||
> [!WARNING]
|
||||
> Don't move the pipeline to CUDA before calling [`~DiffusionPipeline.enable_sequential_cpu_offload`], otherwise the amount of memory saved is only minimal (refer to this [issue](https://github.com/huggingface/diffusers/issues/1934) for more details). This is a stateful operation that installs hooks on the model.
|
||||
|
||||
Call [`~DiffusionPipeline.enable_sequential_cpu_offload`] to enable it on a pipeline.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
|
||||
)
|
||||
pipeline.enable_sequential_cpu_offload()
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
pipe.enable_sequential_cpu_offload()
|
||||
image = pipe(prompt).images[0]
|
||||
pipeline(
|
||||
prompt="An astronaut riding a horse on Mars",
|
||||
guidance_scale=0.,
|
||||
height=768,
|
||||
width=1360,
|
||||
num_inference_steps=4,
|
||||
max_sequence_length=256,
|
||||
).images[0]
|
||||
print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
|
||||
```
|
||||
|
||||
CPU offloading works on submodules rather than whole models. This is the best way to minimize memory consumption, but inference is much slower due to the iterative nature of the diffusion process. The UNet component of the pipeline runs several times (as many as `num_inference_steps`); each time, the different UNet submodules are sequentially onloaded and offloaded as needed, resulting in a large number of memory transfers.
|
||||
|
||||
<Tip>
|
||||
|
||||
Consider using [model offloading](#model-offloading) if you want to optimize for speed because it is much faster. The tradeoff is your memory savings won't be as large.
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
When using [`~StableDiffusionPipeline.enable_sequential_cpu_offload`], don't move the pipeline to CUDA beforehand or else the gain in memory consumption will only be minimal (see this [issue](https://github.com/huggingface/diffusers/issues/1934) for more information).
|
||||
|
||||
[`~StableDiffusionPipeline.enable_sequential_cpu_offload`] is a stateful operation that installs hooks on the models.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Model offloading
|
||||
|
||||
<Tip>
|
||||
Model offloading moves entire models to the GPU instead of selectively moving *some* layers or model components. One of the main pipeline models, usually the text encoder, UNet, and VAE, is placed on the GPU while the other components are held on the CPU. Components like the UNet that run multiple times stays on the GPU until its completely finished and no longer needed. This eliminates the communication overhead of [CPU offloading](#cpu-offloading) and makes model offloading a faster alternative. The tradeoff is memory savings won't be as large.
|
||||
|
||||
Model offloading requires 🤗 Accelerate version 0.17.0 or higher.
|
||||
> [!WARNING]
|
||||
> Keep in mind that if models are reused outside the pipeline after hookes have been installed (see [Removing Hooks](https://huggingface.co/docs/accelerate/en/package_reference/big_modeling#accelerate.hooks.remove_hook_from_module) for more details), you need to run the entire pipeline and models in the expected order to properly offload them. This is a stateful operation that installs hooks on the model.
|
||||
|
||||
</Tip>
|
||||
Call [`~DiffusionPipeline.enable_model_cpu_offload`] to enable it on a pipeline.
|
||||
|
||||
[Sequential CPU offloading](#cpu-offloading) preserves a lot of memory but it makes inference slower because submodules are moved to GPU as needed, and they're immediately returned to the CPU when a new module runs.
|
||||
|
||||
Full-model offloading is an alternative that moves whole models to the GPU, instead of handling each model's constituent *submodules*. There is a negligible impact on inference time (compared with moving the pipeline to `cuda`), and it still provides some memory savings.
|
||||
|
||||
During model offloading, only one of the main components of the pipeline (typically the text encoder, UNet and VAE)
|
||||
is placed on the GPU while the others wait on the CPU. Components like the UNet that run for multiple iterations stay on the GPU until they're no longer needed.
|
||||
|
||||
Enable model offloading by calling [`~StableDiffusionPipeline.enable_model_cpu_offload`] on the pipeline:
|
||||
|
||||
```Python
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-schnell", torch_dtype=torch.bfloat16
|
||||
)
|
||||
pipline.enable_model_cpu_offload()
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
pipe.enable_model_cpu_offload()
|
||||
image = pipe(prompt).images[0]
|
||||
pipeline(
|
||||
prompt="An astronaut riding a horse on Mars",
|
||||
guidance_scale=0.,
|
||||
height=768,
|
||||
width=1360,
|
||||
num_inference_steps=4,
|
||||
max_sequence_length=256,
|
||||
).images[0]
|
||||
print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
In order to properly offload models after they're called, it is required to run the entire pipeline and models are called in the pipeline's expected order. Exercise caution if models are reused outside the context of the pipeline after hooks have been installed. See [Removing Hooks](https://huggingface.co/docs/accelerate/en/package_reference/big_modeling#accelerate.hooks.remove_hook_from_module) for more information.
|
||||
|
||||
[`~StableDiffusionPipeline.enable_model_cpu_offload`] is a stateful operation that installs hooks on the models and state on the pipeline.
|
||||
|
||||
</Tip>
|
||||
[`~DiffusionPipeline.enable_model_cpu_offload`] also helps when you're using the [`~StableDiffusionXLPipeline.encode_prompt`] method on its own to generate the text encoders hidden state.
|
||||
|
||||
## Group offloading
|
||||
|
||||
Group offloading is the middle ground between sequential and model offloading. It works by offloading groups of internal layers (either `torch.nn.ModuleList` or `torch.nn.Sequential`), which uses less memory than model-level offloading. It is also faster than sequential-level offloading because the number of device synchronizations is reduced.
|
||||
Group offloading moves groups of internal layers ([torch.nn.ModuleList](https://pytorch.org/docs/stable/generated/torch.nn.ModuleList.html) or [torch.nn.Sequential](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html)) to the CPU. It uses less memory than [model offloading](#model-offloading) and it is faster than [CPU offloading](#cpu-offloading) because it reduces communication overhead.
|
||||
|
||||
To enable group offloading, call the [`~ModelMixin.enable_group_offload`] method on the model if it is a Diffusers model implementation. For any other model implementation, use [`~hooks.group_offloading.apply_group_offloading`]:
|
||||
> [!WARNING]
|
||||
> Group offloading may not work with all models if the forward implementation contains weight-dependent device casting of inputs because it may clash with group offloading's device casting mechanism.
|
||||
|
||||
```python
|
||||
Call [`~ModelMixin.enable_group_offload`] to enable it for standard Diffusers model components that inherit from [`ModelMixin`]. For other model components that don't inherit from [`ModelMixin`], such as a generic [torch.nn.Module](https://pytorch.org/docs/stable/generated/torch.nn.Module.html), use [`~hooks.apply_group_offloading`] instead.
|
||||
|
||||
The `offload_type` parameter can be set to `block_level` or `leaf_level`.
|
||||
|
||||
- `block_level` offloads groups of layers based on the `num_blocks_per_group` parameter. For example, if `num_blocks_per_group=2` on a model with 40 layers, 2 layers are onloaded and offloaded at a time (20 total onloads/offloads). This drastically reduces memory requirements.
|
||||
- `leaf_level` offloads individual layers at the lowest level and is equivalent to [CPU offloading](#cpu-offloading). But it can be made faster if you use streams without giving up inference speed.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import CogVideoXPipeline
|
||||
from diffusers.hooks import apply_group_offloading
|
||||
from diffusers.utils import export_to_video
|
||||
|
||||
# Load the pipeline
|
||||
onload_device = torch.device("cuda")
|
||||
offload_device = torch.device("cpu")
|
||||
pipe = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
|
||||
pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b", torch_dtype=torch.bfloat16)
|
||||
|
||||
# We can utilize the enable_group_offload method for Diffusers model implementations
|
||||
pipe.transformer.enable_group_offload(onload_device=onload_device, offload_device=offload_device, offload_type="leaf_level", use_stream=True)
|
||||
# Use the enable_group_offload method for Diffusers model implementations
|
||||
pipeline.transformer.enable_group_offload(onload_device=onload_device, offload_device=offload_device, offload_type="leaf_level")
|
||||
pipeline.vae.enable_group_offload(onload_device=onload_device, offload_type="leaf_level")
|
||||
|
||||
# Uncomment the following to also allow recording the current streams.
|
||||
# pipe.transformer.enable_group_offload(onload_device=onload_device, offload_device=offload_device, offload_type="leaf_level", use_stream=True, record_stream=True)
|
||||
|
||||
# For any other model implementations, the apply_group_offloading function can be used
|
||||
apply_group_offloading(pipe.text_encoder, onload_device=onload_device, offload_type="block_level", num_blocks_per_group=2)
|
||||
apply_group_offloading(pipe.vae, onload_device=onload_device, offload_type="leaf_level")
|
||||
# Use the apply_group_offloading method for other model components
|
||||
apply_group_offloading(pipeline.text_encoder, onload_device=onload_device, offload_type="block_level", num_blocks_per_group=2)
|
||||
|
||||
prompt = (
|
||||
"A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
|
||||
@@ -193,48 +273,55 @@ prompt = (
|
||||
"The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
|
||||
"atmosphere of this unique musical performance."
|
||||
)
|
||||
video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
|
||||
# This utilized about 14.79 GB. It can be further reduced by using tiling and using leaf_level offloading throughout the pipeline.
|
||||
video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
|
||||
print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
|
||||
export_to_video(video, "output.mp4", fps=8)
|
||||
```
|
||||
|
||||
Group offloading (for CUDA devices with support for asynchronous data transfer streams) overlaps data transfer and computation to reduce the overall execution time compared to sequential offloading. This is enabled using layer prefetching with CUDA streams. The next layer to be executed is loaded onto the accelerator device while the current layer is being executed - this increases the memory requirements slightly. Group offloading also supports leaf-level offloading (equivalent to sequential CPU offloading) but can be made much faster when using streams.
|
||||
### CUDA stream
|
||||
|
||||
<Tip>
|
||||
The `use_stream` parameter can be activated for CUDA devices that support asynchronous data transfer streams to reduce overall execution time compared to [CPU offloading](#cpu-offloading). It overlaps data transfer and computation by using layer prefetching. The next layer to be executed is loaded onto the GPU while the current layer is still being executed. It can increase CPU memory significantly so ensure you have 2x the amount of memory as the model size.
|
||||
|
||||
- Group offloading may not work with all models out-of-the-box. If the forward implementations of the model contain weight-dependent device-casting of inputs, it may clash with the offloading mechanism's handling of device-casting.
|
||||
- The `offload_type` parameter can be set to either `block_level` or `leaf_level`. `block_level` offloads groups of `torch::nn::ModuleList` or `torch::nn:Sequential` modules based on a configurable attribute `num_blocks_per_group`. For example, if you set `num_blocks_per_group=2` on a standard transformer model containing 40 layers, it will onload/offload 2 layers at a time for a total of 20 onload/offloads. This drastically reduces the VRAM requirements. `leaf_level` offloads individual layers at the lowest level, which is equivalent to sequential offloading. However, unlike sequential offloading, group offloading can be made much faster when using streams, with minimal compromise to end-to-end generation time.
|
||||
- The `use_stream` parameter can be used with CUDA devices to enable prefetching layers for onload. It defaults to `False`. Layer prefetching allows overlapping computation and data transfer of model weights, which drastically reduces the overall execution time compared to other offloading methods. However, it can increase the CPU RAM usage significantly. Ensure that available CPU RAM that is at least twice the size of the model when setting `use_stream=True`. You can find more information about CUDA streams [here](https://pytorch.org/docs/stable/generated/torch.cuda.Stream.html)
|
||||
- If specifying `use_stream=True` on VAEs with tiling enabled, make sure to do a dummy forward pass (possibly with dummy inputs) before the actual inference to avoid device-mismatch errors. This may not work on all implementations. Please open an issue if you encounter any problems.
|
||||
- The parameter `low_cpu_mem_usage` can be set to `True` to reduce CPU memory usage when using streams for group offloading. This is useful when the CPU memory is the bottleneck, but it may counteract the benefits of using streams and increase the overall execution time. The CPU memory savings come from creating pinned-tensors on-the-fly instead of pre-pinning them. This parameter is better suited for using `leaf_level` offloading.
|
||||
- When using `use_stream=True`, users can additionally specify `record_stream=True` to get better speedups at the expense of slightly increased memory usage. Refer to the [official PyTorch docs](https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html) to know more about this.
|
||||
Set `record_stream=True` for more of a speedup at the cost of slightly increased memory usage. Refer to the [torch.Tensor.record_stream](https://pytorch.org/docs/stable/generated/torch.Tensor.record_stream.html) docs to learn more.
|
||||
|
||||
For more information about available parameters and an explanation of how group offloading works, refer to [`~hooks.group_offloading.apply_group_offloading`].
|
||||
> [!TIP]
|
||||
> When `use_stream=True` on VAEs with tiling enabled, make sure to do a dummy forward pass (possible with dummy inputs as well) before inference to avoid device mismatch errors. This may not work on all implementations, so feel free to open an issue if you encounter any problems.
|
||||
|
||||
</Tip>
|
||||
If you're using `block_level` group offloading with `use_stream` enabled, the `num_blocks_per_group` parameter should be set to `1`, otherwise a warning will be raised.
|
||||
|
||||
## FP8 layerwise weight-casting
|
||||
```py
|
||||
pipeline.transformer.enable_group_offload(onload_device=onload_device, offload_device=offload_device, offload_type="leaf_level", use_stream=True, record_stream=True)
|
||||
```
|
||||
|
||||
PyTorch supports `torch.float8_e4m3fn` and `torch.float8_e5m2` as weight storage dtypes, but they can't be used for computation in many different tensor operations due to unimplemented kernel support. However, you can use these dtypes to store model weights in fp8 precision and upcast them on-the-fly when the layers are used in the forward pass. This is known as layerwise weight-casting.
|
||||
The `low_cpu_mem_usage` parameter can be set to `True` to reduce CPU memory usage when using streams during group offloading. It is best for `leaf_level` offloading and when CPU memory is bottlenecked. Memory is saved by creating pinned tensors on the fly instead of pre-pinning them. However, this may increase overall execution time.
|
||||
|
||||
Typically, inference on most models is done with `torch.float16` or `torch.bfloat16` weight/computation precision. Layerwise weight-casting cuts down the memory footprint of the model weights by approximately half.
|
||||
## Layerwise casting
|
||||
|
||||
```python
|
||||
Layerwise casting stores weights in a smaller data format (for example, `torch.float8_e4m3fn` and `torch.float8_e5m2`) to use less memory and upcasts those weights to a higher precision like `torch.float16` or `torch.bfloat16` for computation. Certain layers (normalization and modulation related weights) are skipped because storing them in fp8 can degrade generation quality.
|
||||
|
||||
> [!WARNING]
|
||||
> Layerwise casting may not work with all models if the forward implementation contains internal typecasting of weights. The current implementation of layerwise casting assumes the forward pass is independent of the weight precision and the input datatypes are always specified in `compute_dtype` (see [here](https://github.com/huggingface/transformers/blob/7f5077e53682ca855afc826162b204ebf809f1f9/src/transformers/models/t5/modeling_t5.py#L294-L299) for an incompatible implementation).
|
||||
>
|
||||
> Layerwise casting may also fail on custom modeling implementations with [PEFT](https://huggingface.co/docs/peft/index) layers. There are some checks available but they are not extensively tested or guaranteed to work in all cases.
|
||||
|
||||
Call [`~ModelMixin.enable_layerwise_casting`] to set the storage and computation datatypes.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import CogVideoXPipeline, CogVideoXTransformer3DModel
|
||||
from diffusers.utils import export_to_video
|
||||
|
||||
model_id = "THUDM/CogVideoX-5b"
|
||||
|
||||
# Load the model in bfloat16 and enable layerwise casting
|
||||
transformer = CogVideoXTransformer3DModel.from_pretrained(model_id, subfolder="transformer", torch_dtype=torch.bfloat16)
|
||||
transformer = CogVideoXTransformer3DModel.from_pretrained(
|
||||
"THUDM/CogVideoX-5b",
|
||||
subfolder="transformer",
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
transformer.enable_layerwise_casting(storage_dtype=torch.float8_e4m3fn, compute_dtype=torch.bfloat16)
|
||||
|
||||
# Load the pipeline
|
||||
pipe = CogVideoXPipeline.from_pretrained(model_id, transformer=transformer, torch_dtype=torch.bfloat16)
|
||||
pipe.to("cuda")
|
||||
|
||||
pipeline = CogVideoXPipeline.from_pretrained("THUDM/CogVideoX-5b",
|
||||
transformer=transformer,
|
||||
torch_dtype=torch.bfloat16
|
||||
).to("cuda")
|
||||
prompt = (
|
||||
"A panda, dressed in a small, red jacket and a tiny hat, sits on a wooden stool in a serene bamboo forest. "
|
||||
"The panda's fluffy paws strum a miniature acoustic guitar, producing soft, melodic tunes. Nearby, a few other "
|
||||
@@ -243,43 +330,53 @@ prompt = (
|
||||
"The background includes a small, flowing stream and vibrant green foliage, enhancing the peaceful and magical "
|
||||
"atmosphere of this unique musical performance."
|
||||
)
|
||||
video = pipe(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
|
||||
video = pipeline(prompt=prompt, guidance_scale=6, num_inference_steps=50).frames[0]
|
||||
print(f"Max memory reserved: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
|
||||
export_to_video(video, "output.mp4", fps=8)
|
||||
```
|
||||
|
||||
In the above example, layerwise casting is enabled on the transformer component of the pipeline. By default, certain layers are skipped from the FP8 weight casting because it can lead to significant degradation of generation quality. The normalization and modulation related weight parameters are also skipped by default.
|
||||
|
||||
However, you gain more control and flexibility by directly utilizing the [`~hooks.layerwise_casting.apply_layerwise_casting`] function instead of [`~ModelMixin.enable_layerwise_casting`].
|
||||
|
||||
<Tip>
|
||||
|
||||
- Layerwise casting may not work with all models out-of-the-box. Sometimes, the forward implementations of the model might contain internal typecasting of weight values. Such implementations are not supported due to the currently simplistic implementation of layerwise casting, which assumes that the forward pass is independent of the weight precision and that the input dtypes are always in `compute_dtype`. An example of an incompatible implementation can be found [here](https://github.com/huggingface/transformers/blob/7f5077e53682ca855afc826162b204ebf809f1f9/src/transformers/models/t5/modeling_t5.py#L294-L299).
|
||||
- Layerwise casting may fail on custom modeling implementations that make use of [PEFT](https://github.com/huggingface/peft) layers. Some minimal checks to handle this case is implemented but is not extensively tested or guaranteed to work in all cases.
|
||||
- It can be also be applied partially to specific layers of a model. Partially applying layerwise casting can either be done manually by calling the `apply_layerwise_casting` function on specific internal modules, or by specifying the `skip_modules_pattern` and `skip_modules_classes` parameters for a root module. These parameters are particularly useful for layers such as normalization and modulation.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Channels-last memory format
|
||||
|
||||
The channels-last memory format is an alternative way of ordering NCHW tensors in memory to preserve dimension ordering. Channels-last tensors are ordered in such a way that the channels become the densest dimension (storing images pixel-per-pixel). Since not all operators currently support the channels-last format, it may result in worst performance but you should still try and see if it works for your model.
|
||||
|
||||
For example, to set the pipeline's UNet to use the channels-last format:
|
||||
The [`~hooks.apply_layerwise_casting`] method can also be used if you need more control and flexibility. It can be partially applied to model layers by calling it on specific internal modules. Use the `skip_modules_pattern` or `skip_modules_classes` parameters to specify modules to avoid, such as the normalization and modulation layers.
|
||||
|
||||
```python
|
||||
print(pipe.unet.conv_out.state_dict()["weight"].stride()) # (2880, 9, 3, 1)
|
||||
pipe.unet.to(memory_format=torch.channels_last) # in-place operation
|
||||
import torch
|
||||
from diffusers import CogVideoXTransformer3DModel
|
||||
from diffusers.hooks import apply_layerwise_casting
|
||||
|
||||
transformer = CogVideoXTransformer3DModel.from_pretrained(
|
||||
"THUDM/CogVideoX-5b",
|
||||
subfolder="transformer",
|
||||
torch_dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
# skip the normalization layer
|
||||
apply_layerwise_casting(
|
||||
transformer,
|
||||
storage_dtype=torch.float8_e4m3fn,
|
||||
compute_dtype=torch.bfloat16,
|
||||
skip_modules_classes=["norm"],
|
||||
non_blocking=True,
|
||||
)
|
||||
```
|
||||
|
||||
## torch.channels_last
|
||||
|
||||
[torch.channels_last](https://pytorch.org/tutorials/intermediate/memory_format_tutorial.html) flips how tensors are stored from `(batch size, channels, height, width)` to `(batch size, heigh, width, channels)`. This aligns the tensors with how the hardware sequentially accesses the tensors stored in memory and avoids skipping around in memory to access the pixel values.
|
||||
|
||||
Not all operators currently support the channels-last format and may result in worst performance, but it is still worth trying.
|
||||
|
||||
```py
|
||||
print(pipeline.unet.conv_out.state_dict()["weight"].stride()) # (2880, 9, 3, 1)
|
||||
pipeline.unet.to(memory_format=torch.channels_last) # in-place operation
|
||||
print(
|
||||
pipe.unet.conv_out.state_dict()["weight"].stride()
|
||||
pipeline.unet.conv_out.state_dict()["weight"].stride()
|
||||
) # (2880, 1, 960, 320) having a stride of 1 for the 2nd dimension proves that it works
|
||||
```
|
||||
|
||||
## Tracing
|
||||
## torch.jit.trace
|
||||
|
||||
Tracing runs an example input tensor through the model and captures the operations that are performed on it as that input makes its way through the model's layers. The executable or `ScriptFunction` that is returned is optimized with just-in-time compilation.
|
||||
[torch.jit.trace](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) records the operations a model performs on a sample input and creates a new, optimized representation of the model based on the recorded execution path. During tracing, the model is optimized to reduce overhead from Python and dynamic control flows and operations are fused together for more efficiency. The returned executable or [ScriptFunction](https://pytorch.org/docs/stable/generated/torch.jit.ScriptFunction.html) can be compiled.
|
||||
|
||||
To trace a UNet:
|
||||
|
||||
```python
|
||||
```py
|
||||
import time
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
@@ -292,8 +389,7 @@ torch.set_grad_enabled(False)
|
||||
n_experiments = 2
|
||||
unet_runs_per_experiment = 50
|
||||
|
||||
|
||||
# load inputs
|
||||
# load sample inputs
|
||||
def generate_inputs():
|
||||
sample = torch.randn((2, 4, 64, 64), device="cuda", dtype=torch.float16)
|
||||
timestep = torch.rand(1, device="cuda", dtype=torch.float16) * 999
|
||||
@@ -301,12 +397,12 @@ def generate_inputs():
|
||||
return sample, timestep, encoder_hidden_states
|
||||
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
pipeline = StableDiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
).to("cuda")
|
||||
unet = pipe.unet
|
||||
unet = pipeline.unet
|
||||
unet.eval()
|
||||
unet.to(memory_format=torch.channels_last) # use channels_last memory format
|
||||
unet.forward = functools.partial(unet.forward, return_dict=False) # set return_dict=False as default
|
||||
@@ -323,14 +419,12 @@ unet_traced = torch.jit.trace(unet, inputs)
|
||||
unet_traced.eval()
|
||||
print("done tracing")
|
||||
|
||||
|
||||
# warmup and optimize graph
|
||||
for _ in range(5):
|
||||
with torch.inference_mode():
|
||||
inputs = generate_inputs()
|
||||
orig_output = unet_traced(*inputs)
|
||||
|
||||
|
||||
# benchmarking
|
||||
with torch.inference_mode():
|
||||
for _ in range(n_experiments):
|
||||
@@ -352,20 +446,18 @@ with torch.inference_mode():
|
||||
unet_traced.save("unet_traced.pt")
|
||||
```
|
||||
|
||||
Replace the `unet` attribute of the pipeline with the traced model:
|
||||
Replace the pipeline's UNet with the traced version.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class UNet2DConditionOutput:
|
||||
sample: torch.Tensor
|
||||
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained(
|
||||
pipeline = StableDiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
@@ -374,8 +466,7 @@ pipe = StableDiffusionPipeline.from_pretrained(
|
||||
# use jitted unet
|
||||
unet_traced = torch.jit.load("unet_traced.pt")
|
||||
|
||||
|
||||
# del pipe.unet
|
||||
# del pipeline.unet
|
||||
class TracedUNet(torch.nn.Module):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
@@ -386,8 +477,7 @@ class TracedUNet(torch.nn.Module):
|
||||
sample = unet_traced(latent_model_input, t, encoder_hidden_states)[0]
|
||||
return UNet2DConditionOutput(sample=sample)
|
||||
|
||||
|
||||
pipe.unet = TracedUNet()
|
||||
pipeline.unet = TracedUNet()
|
||||
|
||||
with torch.inference_mode():
|
||||
image = pipe([prompt] * 1, num_inference_steps=50).images[0]
|
||||
@@ -395,39 +485,31 @@ with torch.inference_mode():
|
||||
|
||||
## Memory-efficient attention
|
||||
|
||||
Recent work on optimizing bandwidth in the attention block has generated huge speed-ups and reductions in GPU memory usage. The most recent type of memory-efficient attention is [Flash Attention](https://arxiv.org/abs/2205.14135) (you can check out the original code at [HazyResearch/flash-attention](https://github.com/HazyResearch/flash-attention)).
|
||||
> [!TIP]
|
||||
> Memory-efficient attention optimizes for memory usage *and* [inference speed](./fp16#scaled-dot-product-attention!
|
||||
|
||||
<Tip>
|
||||
The Transformers attention mechanism is memory-intensive, especially for long sequences, so you can try using different and more memory-efficient attention types.
|
||||
|
||||
If you have PyTorch >= 2.0 installed, you should not expect a speed-up for inference when enabling `xformers`.
|
||||
By default, if PyTorch >= 2.0 is installed, [scaled dot-product attention (SDPA)](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) is used. You don't need to make any additional changes to your code.
|
||||
|
||||
</Tip>
|
||||
SDPA supports [FlashAttention](https://github.com/Dao-AILab/flash-attention) and [xFormers](https://github.com/facebookresearch/xformers) as well as a native C++ PyTorch implementation. It automatically selects the most optimal implementation based on your input.
|
||||
|
||||
To use Flash Attention, install the following:
|
||||
You can explicitly use xFormers with the [`~ModelMixin.enable_xformers_memory_efficient_attention`] method.
|
||||
|
||||
- PyTorch > 1.12
|
||||
- CUDA available
|
||||
- [xFormers](xformers)
|
||||
|
||||
Then call [`~ModelMixin.enable_xformers_memory_efficient_attention`] on the pipeline:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
```py
|
||||
# pip install xformers
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
).to("cuda")
|
||||
|
||||
pipe.enable_xformers_memory_efficient_attention()
|
||||
|
||||
with torch.inference_mode():
|
||||
sample = pipe("a small cat")
|
||||
|
||||
# optional: You can disable it via
|
||||
# pipe.disable_xformers_memory_efficient_attention()
|
||||
pipeline.enable_xformers_memory_efficient_attention()
|
||||
```
|
||||
|
||||
The iteration speed when using `xformers` should match the iteration speed of PyTorch 2.0 as described [here](torch2.0).
|
||||
Call [`~ModelMixin.disable_xformers_memory_efficient_attention`] to disable it.
|
||||
|
||||
```py
|
||||
pipeline.disable_xformers_memory_efficient_attention()
|
||||
```
|
||||
@@ -85,7 +85,7 @@ The quantization methods supported are as follows:
|
||||
| **Category** | **Full Function Names** | **Shorthands** |
|
||||
|--------------|-------------------------|----------------|
|
||||
| **Integer quantization** | `int4_weight_only`, `int8_dynamic_activation_int4_weight`, `int8_weight_only`, `int8_dynamic_activation_int8_weight` | `int4wo`, `int4dq`, `int8wo`, `int8dq` |
|
||||
| **Floating point 8-bit quantization** | `float8_weight_only`, `float8_dynamic_activation_float8_weight`, `float8_static_activation_float8_weight` | `float8wo`, `float8wo_e5m2`, `float8wo_e4m3`, `float8dq`, `float8dq_e4m3`, `float8_e4m3_tensor`, `float8_e4m3_row` |
|
||||
| **Floating point 8-bit quantization** | `float8_weight_only`, `float8_dynamic_activation_float8_weight`, `float8_static_activation_float8_weight` | `float8wo`, `float8wo_e5m2`, `float8wo_e4m3`, `float8dq`, `float8dq_e4m3`, `float8dq_e4m3_tensor`, `float8dq_e4m3_row` |
|
||||
| **Floating point X-bit quantization** | `fpx_weight_only` | `fpX_eAwB` where `X` is the number of bits (1-7), `A` is exponent bits, and `B` is mantissa bits. Constraint: `X == A + B + 1` |
|
||||
| **Unsigned Integer quantization** | `uintx_weight_only` | `uint1wo`, `uint2wo`, `uint3wo`, `uint4wo`, `uint5wo`, `uint6wo`, `uint7wo` |
|
||||
|
||||
|
||||
@@ -1,139 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Working with big models
|
||||
|
||||
A modern diffusion model, like [Stable Diffusion XL (SDXL)](../using-diffusers/sdxl), is not just a single model, but a collection of multiple models. SDXL has four different model-level components:
|
||||
|
||||
* A variational autoencoder (VAE)
|
||||
* Two text encoders
|
||||
* A UNet for denoising
|
||||
|
||||
Usually, the text encoders and the denoiser are much larger compared to the VAE.
|
||||
|
||||
As models get bigger and better, it’s possible your model is so big that even a single copy won’t fit in memory. But that doesn’t mean it can’t be loaded. If you have more than one GPU, there is more memory available to store your model. In this case, it’s better to split your model checkpoint into several smaller *checkpoint shards*.
|
||||
|
||||
When a text encoder checkpoint has multiple shards, like [T5-xxl for SD3](https://huggingface.co/stabilityai/stable-diffusion-3-medium-diffusers/tree/main/text_encoder_3), it is automatically handled by the [Transformers](https://huggingface.co/docs/transformers/index) library as it is a required dependency of Diffusers when using the [`StableDiffusion3Pipeline`]. More specifically, Transformers will automatically handle the loading of multiple shards within the requested model class and get it ready so that inference can be performed.
|
||||
|
||||
The denoiser checkpoint can also have multiple shards and supports inference thanks to the [Accelerate](https://huggingface.co/docs/accelerate/index) library.
|
||||
|
||||
> [!TIP]
|
||||
> Refer to the [Handling big models for inference](https://huggingface.co/docs/accelerate/main/en/concept_guides/big_model_inference) guide for general guidance when working with big models that are hard to fit into memory.
|
||||
|
||||
For example, let's save a sharded checkpoint for the [SDXL UNet](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/tree/main/unet):
|
||||
|
||||
```python
|
||||
from diffusers import AutoModel
|
||||
|
||||
unet = AutoModel.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", subfolder="unet"
|
||||
)
|
||||
unet.save_pretrained("sdxl-unet-sharded", max_shard_size="5GB")
|
||||
```
|
||||
|
||||
The size of the fp32 variant of the SDXL UNet checkpoint is ~10.4GB. Set the `max_shard_size` parameter to 5GB to create 3 shards. After saving, you can load them in [`StableDiffusionXLPipeline`]:
|
||||
|
||||
```python
|
||||
from diffusers import AutoModel, StableDiffusionXLPipeline
|
||||
import torch
|
||||
|
||||
unet = AutoModel.from_pretrained(
|
||||
"sayakpaul/sdxl-unet-sharded", torch_dtype=torch.float16
|
||||
)
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", unet=unet, torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
image = pipeline("a cute dog running on the grass", num_inference_steps=30).images[0]
|
||||
image.save("dog.png")
|
||||
```
|
||||
|
||||
If placing all the model-level components on the GPU at once is not feasible, use [`~DiffusionPipeline.enable_model_cpu_offload`] to help you:
|
||||
|
||||
```diff
|
||||
- pipeline.to("cuda")
|
||||
+ pipeline.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
In general, we recommend sharding when a checkpoint is more than 5GB (in fp32).
|
||||
|
||||
## Device placement
|
||||
|
||||
On distributed setups, you can run inference across multiple GPUs with Accelerate.
|
||||
|
||||
> [!WARNING]
|
||||
> This feature is experimental and its APIs might change in the future.
|
||||
|
||||
With Accelerate, you can use the `device_map` to determine how to distribute the models of a pipeline across multiple devices. This is useful in situations where you have more than one GPU.
|
||||
|
||||
For example, if you have two 8GB GPUs, then using [`~DiffusionPipeline.enable_model_cpu_offload`] may not work so well because:
|
||||
|
||||
* it only works on a single GPU
|
||||
* a single model might not fit on a single GPU ([`~DiffusionPipeline.enable_sequential_cpu_offload`] might work but it will be extremely slow and it is also limited to a single GPU)
|
||||
|
||||
To make use of both GPUs, you can use the "balanced" device placement strategy which splits the models across all available GPUs.
|
||||
|
||||
> [!WARNING]
|
||||
> Only the "balanced" strategy is supported at the moment, and we plan to support additional mapping strategies in the future.
|
||||
|
||||
```diff
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
- "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True,
|
||||
+ "stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True, device_map="balanced"
|
||||
)
|
||||
image = pipeline("a dog").images[0]
|
||||
image
|
||||
```
|
||||
|
||||
You can also pass a dictionary to enforce the maximum GPU memory that can be used on each device:
|
||||
|
||||
```diff
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
max_memory = {0:"1GB", 1:"1GB"}
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
device_map="balanced",
|
||||
+ max_memory=max_memory
|
||||
)
|
||||
image = pipeline("a dog").images[0]
|
||||
image
|
||||
```
|
||||
|
||||
If a device is not present in `max_memory`, then it will be completely ignored and will not participate in the device placement.
|
||||
|
||||
By default, Diffusers uses the maximum memory of all devices. If the models don't fit on the GPUs, they are offloaded to the CPU. If the CPU doesn't have enough memory, then you might see an error. In that case, you could defer to using [`~DiffusionPipeline.enable_sequential_cpu_offload`] and [`~DiffusionPipeline.enable_model_cpu_offload`].
|
||||
|
||||
Call [`~DiffusionPipeline.reset_device_map`] to reset the `device_map` of a pipeline. This is also necessary if you want to use methods like `to()`, [`~DiffusionPipeline.enable_sequential_cpu_offload`], and [`~DiffusionPipeline.enable_model_cpu_offload`] on a pipeline that was device-mapped.
|
||||
|
||||
```py
|
||||
pipeline.reset_device_map()
|
||||
```
|
||||
|
||||
Once a pipeline has been device-mapped, you can also access its device map via `hf_device_map`:
|
||||
|
||||
```py
|
||||
print(pipeline.hf_device_map)
|
||||
```
|
||||
|
||||
An example device map would look like so:
|
||||
|
||||
|
||||
```bash
|
||||
{'unet': 1, 'vae': 1, 'safety_checker': 0, 'text_encoder': 0}
|
||||
```
|
||||
@@ -10,218 +10,625 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
[[open-in-colab]]
|
||||
# LoRA
|
||||
|
||||
# Load LoRAs for inference
|
||||
[LoRA (Low-Rank Adaptation)](https://huggingface.co/papers/2106.09685) is a method for quickly training a model for a new task. It works by freezing the original model weights and adding a small number of *new* trainable parameters. This means it is significantly faster and cheaper to adapt an existing model to new tasks, such as generating images in a new style.
|
||||
|
||||
There are many adapter types (with [LoRAs](https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora) being the most popular) trained in different styles to achieve different effects. You can even combine multiple adapters to create new and unique images.
|
||||
LoRA checkpoints are typically only a couple hundred MBs in size, so they're very lightweight and easy to store. Load these smaller set of weights into an existing base model with [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] and specify the file name.
|
||||
|
||||
In this tutorial, you'll learn how to easily load and manage adapters for inference with the 🤗 [PEFT](https://huggingface.co/docs/peft/index) integration in 🤗 Diffusers. You'll use LoRA as the main adapter technique, so you'll see the terms LoRA and adapter used interchangeably.
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="text-to-image">
|
||||
|
||||
Let's first install all the required libraries.
|
||||
|
||||
```bash
|
||||
!pip install -q transformers accelerate peft diffusers
|
||||
```
|
||||
|
||||
Now, load a pipeline with a [Stable Diffusion XL (SDXL)](../api/pipelines/stable_diffusion/stable_diffusion_xl) checkpoint:
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
|
||||
pipe_id = "stabilityai/stable-diffusion-xl-base-1.0"
|
||||
pipe = DiffusionPipeline.from_pretrained(pipe_id, torch_dtype=torch.float16).to("cuda")
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.load_lora_weights(
|
||||
"ostris/super-cereal-sdxl-lora",
|
||||
weight_name="cereal_box_sdxl_v1.safetensors",
|
||||
adapter_name="cereal"
|
||||
)
|
||||
pipeline("bears, pizza bites").images[0]
|
||||
```
|
||||
|
||||
Next, load a [CiroN2022/toy-face](https://huggingface.co/CiroN2022/toy-face) adapter with the [`~diffusers.loaders.StableDiffusionXLLoraLoaderMixin.load_lora_weights`] method. With the 🤗 PEFT integration, you can assign a specific `adapter_name` to the checkpoint, which lets you easily switch between different LoRA checkpoints. Let's call this adapter `"toy"`.
|
||||
</hfoption>
|
||||
<hfoption id="text-to-video">
|
||||
|
||||
```python
|
||||
pipe.load_lora_weights("CiroN2022/toy-face", weight_name="toy_face_sdxl.safetensors", adapter_name="toy")
|
||||
```py
|
||||
import torch
|
||||
from diffusers import LTXConditionPipeline
|
||||
from diffusers.utils import export_to_video, load_image
|
||||
|
||||
pipeline = LTXConditionPipeline.from_pretrained(
|
||||
"Lightricks/LTX-Video-0.9.5", torch_dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
pipeline.load_lora_weights(
|
||||
"Lightricks/LTX-Video-Cakeify-LoRA",
|
||||
weight_name="ltxv_095_cakeify_lora.safetensors",
|
||||
adapter_name="cakeify"
|
||||
)
|
||||
pipeline.set_adapters("cakeify")
|
||||
|
||||
# use "CAKEIFY" to trigger the LoRA
|
||||
prompt = "CAKEIFY a person using a knife to cut a cake shaped like a Pikachu plushie"
|
||||
image = load_image("https://huggingface.co/Lightricks/LTX-Video-Cakeify-LoRA/resolve/main/assets/images/pikachu.png")
|
||||
|
||||
video = pipeline(
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
width=576,
|
||||
height=576,
|
||||
num_frames=161,
|
||||
decode_timestep=0.03,
|
||||
decode_noise_scale=0.025,
|
||||
num_inference_steps=50,
|
||||
).frames[0]
|
||||
export_to_video(video, "output.mp4", fps=26)
|
||||
```
|
||||
|
||||
Make sure to include the token `toy_face` in the prompt and then you can perform inference:
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
```python
|
||||
prompt = "toy_face of a hacker with a hoodie"
|
||||
The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method is the preferred way to load LoRA weights into the UNet and text encoder because it can handle cases where:
|
||||
|
||||
lora_scale = 0.9
|
||||
image = pipe(
|
||||
prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
|
||||
).images[0]
|
||||
image
|
||||
- the LoRA weights don't have separate UNet and text encoder identifiers
|
||||
- the LoRA weights have separate UNet and text encoder identifiers
|
||||
|
||||
The [`~loaders.PeftAdapterMixin.load_lora_adapter`] method is used to directly load a LoRA adapter at the *model-level*, as long as the model is a Diffusers model that is a subclass of [`PeftAdapterMixin`]. It builds and prepares the necessary model configuration for the adapter. This method also loads the LoRA adapter into the UNet.
|
||||
|
||||
For example, if you're only loading a LoRA into the UNet, [`~loaders.PeftAdapterMixin.load_lora_adapter`] ignores the text encoder keys. Use the `prefix` parameter to filter and load the appropriate state dicts, `"unet"` to load.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.unet.load_lora_adapter(
|
||||
"jbilcke-hf/sdxl-cinematic-1",
|
||||
weight_name="pytorch_lora_weights.safetensors",
|
||||
adapter_name="cinematic"
|
||||
prefix="unet"
|
||||
)
|
||||
# use cnmt in the prompt to trigger the LoRA
|
||||
pipeline("A cute cnmt eating a slice of pizza, stunning color scheme, masterpiece, illustration").images[0]
|
||||
```
|
||||
|
||||

|
||||
## torch.compile
|
||||
|
||||
With the `adapter_name` parameter, it is really easy to use another adapter for inference! Load the [nerijs/pixel-art-xl](https://huggingface.co/nerijs/pixel-art-xl) adapter that has been fine-tuned to generate pixel art images and call it `"pixel"`.
|
||||
[torch.compile](../optimization/torch2.0#torchcompile) speeds up inference by compiling the PyTorch model to use optimized kernels. Before compiling, the LoRA weights need to be fused into the base model and unloaded first.
|
||||
|
||||
The pipeline automatically sets the first loaded adapter (`"toy"`) as the active adapter, but you can activate the `"pixel"` adapter with the [`~loaders.peft.PeftAdapterMixin.set_adapters`] method:
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
```python
|
||||
pipe.load_lora_weights("nerijs/pixel-art-xl", weight_name="pixel-art-xl.safetensors", adapter_name="pixel")
|
||||
pipe.set_adapters("pixel")
|
||||
# load base model and LoRA
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.load_lora_weights(
|
||||
"ostris/ikea-instructions-lora-sdxl",
|
||||
weight_name="ikea_instructions_xl_v1_5.safetensors",
|
||||
adapter_name="ikea"
|
||||
)
|
||||
|
||||
# activate LoRA and set adapter weight
|
||||
pipeline.set_adapters("ikea", adapter_weights=0.7)
|
||||
|
||||
# fuse LoRAs and unload weights
|
||||
pipeline.fuse_lora(adapter_names=["ikea"], lora_scale=1.0)
|
||||
pipeline.unload_lora_weights()
|
||||
```
|
||||
|
||||
Make sure you include the token `pixel art` in your prompt to generate a pixel art image:
|
||||
Typically, the UNet is compiled because its the most compute intensive component of the pipeline.
|
||||
|
||||
```python
|
||||
prompt = "a hacker with a hoodie, pixel art"
|
||||
image = pipe(
|
||||
prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
|
||||
).images[0]
|
||||
image
|
||||
```py
|
||||
pipeline.unet.to(memory_format=torch.channels_last)
|
||||
pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
|
||||
|
||||
pipeline("A bowl of ramen shaped like a cute kawaii bear").images[0]
|
||||
```
|
||||
|
||||

|
||||
Refer to the [hotswapping](#hotswapping) section to learn how to avoid recompilation when working with compiled models and multiple LoRAs.
|
||||
|
||||
<Tip>
|
||||
## Weight scale
|
||||
|
||||
By default, if the most up-to-date versions of PEFT and Transformers are detected, `low_cpu_mem_usage` is set to `True` to speed up the loading time of LoRA checkpoints.
|
||||
The `scale` parameter is used to control how much of a LoRA to apply. A value of `0` is equivalent to only using the base model weights and a value of `1` is equivalent to fully using the LoRA.
|
||||
|
||||
</Tip>
|
||||
<hfoptions id="weight-scale">
|
||||
<hfoption id="simple use case">
|
||||
|
||||
## Merge adapters
|
||||
For simple use cases, you can pass `cross_attention_kwargs={"scale": 1.0}` to the pipeline.
|
||||
|
||||
You can also merge different adapter checkpoints for inference to blend their styles together.
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
|
||||
Once again, use the [`~loaders.peft.PeftAdapterMixin.set_adapters`] method to activate the `pixel` and `toy` adapters and specify the weights for how they should be merged.
|
||||
|
||||
```python
|
||||
pipe.set_adapters(["pixel", "toy"], adapter_weights=[0.5, 1.0])
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.load_lora_weights(
|
||||
"ostris/super-cereal-sdxl-lora",
|
||||
weight_name="cereal_box_sdxl_v1.safetensors",
|
||||
adapter_name="cereal"
|
||||
)
|
||||
pipeline("bears, pizza bites", cross_attention_kwargs={"scale": 1.0}).images[0]
|
||||
```
|
||||
|
||||
<Tip>
|
||||
</hfoption>
|
||||
<hfoption id="finer control">
|
||||
|
||||
LoRA checkpoints in the diffusion community are almost always obtained with [DreamBooth](https://huggingface.co/docs/diffusers/main/en/training/dreambooth). DreamBooth training often relies on "trigger" words in the input text prompts in order for the generation results to look as expected. When you combine multiple LoRA checkpoints, it's important to ensure the trigger words for the corresponding LoRA checkpoints are present in the input text prompts.
|
||||
> [!WARNING]
|
||||
> The [`~loaders.PeftAdapterMixin.set_adapters`] method only scales attention weights. If a LoRA has ResNets or down and upsamplers, these components keep a scale value of `1.0`.
|
||||
|
||||
</Tip>
|
||||
For finer control over each individual component of the UNet or text encoder, pass a dictionary instead. In the example below, the `"down"` block in the UNet is scaled by 0.9 and you can further specify in the `"up"` block the scales of the transformers in `"block_0"` and `"block_1"`. If a block like `"mid"` isn't specified, the default value `1.0` is used.
|
||||
|
||||
Remember to use the trigger words for [CiroN2022/toy-face](https://hf.co/CiroN2022/toy-face) and [nerijs/pixel-art-xl](https://hf.co/nerijs/pixel-art-xl) (these are found in their repositories) in the prompt to generate an image.
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
|
||||
```python
|
||||
prompt = "toy_face of a hacker with a hoodie, pixel art"
|
||||
image = pipe(
|
||||
prompt, num_inference_steps=30, cross_attention_kwargs={"scale": 1.0}, generator=torch.manual_seed(0)
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
Impressive! As you can see, the model generated an image that mixed the characteristics of both adapters.
|
||||
|
||||
> [!TIP]
|
||||
> Through its PEFT integration, Diffusers also offers more efficient merging methods which you can learn about in the [Merge LoRAs](../using-diffusers/merge_loras) guide!
|
||||
|
||||
To return to only using one adapter, use the [`~loaders.peft.PeftAdapterMixin.set_adapters`] method to activate the `"toy"` adapter:
|
||||
|
||||
```python
|
||||
pipe.set_adapters("toy")
|
||||
|
||||
prompt = "toy_face of a hacker with a hoodie"
|
||||
lora_scale = 0.9
|
||||
image = pipe(
|
||||
prompt, num_inference_steps=30, cross_attention_kwargs={"scale": lora_scale}, generator=torch.manual_seed(0)
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
Or to disable all adapters entirely, use the [`~loaders.peft.PeftAdapterMixin.disable_lora`] method to return the base model.
|
||||
|
||||
```python
|
||||
pipe.disable_lora()
|
||||
|
||||
prompt = "toy_face of a hacker with a hoodie"
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
### Customize adapters strength
|
||||
|
||||
For even more customization, you can control how strongly the adapter affects each part of the pipeline. For this, pass a dictionary with the control strengths (called "scales") to [`~loaders.peft.PeftAdapterMixin.set_adapters`].
|
||||
|
||||
For example, here's how you can turn on the adapter for the `down` parts, but turn it off for the `mid` and `up` parts:
|
||||
```python
|
||||
pipe.enable_lora() # enable lora again, after we disabled it above
|
||||
prompt = "toy_face of a hacker with a hoodie, pixel art"
|
||||
adapter_weight_scales = { "unet": { "down": 1, "mid": 0, "up": 0} }
|
||||
pipe.set_adapters("pixel", adapter_weight_scales)
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
Let's see how turning off the `down` part and turning on the `mid` and `up` part respectively changes the image.
|
||||
```python
|
||||
adapter_weight_scales = { "unet": { "down": 0, "mid": 1, "up": 0} }
|
||||
pipe.set_adapters("pixel", adapter_weight_scales)
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
```python
|
||||
adapter_weight_scales = { "unet": { "down": 0, "mid": 0, "up": 1} }
|
||||
pipe.set_adapters("pixel", adapter_weight_scales)
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||

|
||||
|
||||
Looks cool!
|
||||
|
||||
This is a really powerful feature. You can use it to control the adapter strengths down to per-transformer level. And you can even use it for multiple adapters.
|
||||
```python
|
||||
adapter_weight_scales_toy = 0.5
|
||||
adapter_weight_scales_pixel = {
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.load_lora_weights(
|
||||
"ostris/super-cereal-sdxl-lora",
|
||||
weight_name="cereal_box_sdxl_v1.safetensors",
|
||||
adapter_name="cereal"
|
||||
)
|
||||
scales = {
|
||||
"text_encoder": 0.5,
|
||||
"text_encoder_2": 0.5,
|
||||
"unet": {
|
||||
"down": 0.9, # all transformers in the down-part will use scale 0.9
|
||||
# "mid" # because, in this example, "mid" is not given, all transformers in the mid part will use the default scale 1.0
|
||||
"down": 0.9,
|
||||
"up": {
|
||||
"block_0": 0.6, # all 3 transformers in the 0th block in the up-part will use scale 0.6
|
||||
"block_1": [0.4, 0.8, 1.0], # the 3 transformers in the 1st block in the up-part will use scales 0.4, 0.8 and 1.0 respectively
|
||||
"block_0": 0.6,
|
||||
"block_1": [0.4, 0.8, 1.0],
|
||||
}
|
||||
}
|
||||
}
|
||||
pipe.set_adapters(["toy", "pixel"], [adapter_weight_scales_toy, adapter_weight_scales_pixel])
|
||||
image = pipe(prompt, num_inference_steps=30, generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
pipeline.set_adapters("cereal", scales)
|
||||
pipeline("bears, pizza bites").images[0]
|
||||
```
|
||||
|
||||

|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Manage adapters
|
||||
## Hotswapping
|
||||
|
||||
You have attached multiple adapters in this tutorial, and if you're feeling a bit lost on what adapters have been attached to the pipeline's components, use the [`~diffusers.loaders.StableDiffusionLoraLoaderMixin.get_active_adapters`] method to check the list of active adapters:
|
||||
Hotswapping LoRAs is an efficient way to work with multiple LoRAs while avoiding accumulating memory from multiple calls to [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] and in some cases, recompilation, if a model is compiled. This workflow requires a loaded LoRA because the new LoRA weights are swapped in place for the existing loaded LoRA.
|
||||
|
||||
```py
|
||||
active_adapters = pipe.get_active_adapters()
|
||||
active_adapters
|
||||
["toy", "pixel"]
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
# load base model and LoRAs
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.load_lora_weights(
|
||||
"ostris/ikea-instructions-lora-sdxl",
|
||||
weight_name="ikea_instructions_xl_v1_5.safetensors",
|
||||
adapter_name="ikea"
|
||||
)
|
||||
```
|
||||
|
||||
You can also get the active adapters of each pipeline component with [`~diffusers.loaders.StableDiffusionLoraLoaderMixin.get_list_adapters`]:
|
||||
> [!WARNING]
|
||||
> Hotswapping is unsupported for LoRAs that target the text encoder.
|
||||
|
||||
Set `hotswap=True` in [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] to swap the second LoRA. Use the `adapter_name` parameter to indicate which LoRA to swap (`default_0` is the default name).
|
||||
|
||||
```py
|
||||
list_adapters_component_wise = pipe.get_list_adapters()
|
||||
list_adapters_component_wise
|
||||
{"text_encoder": ["toy", "pixel"], "unet": ["toy", "pixel"], "text_encoder_2": ["toy", "pixel"]}
|
||||
pipeline.load_lora_weights(
|
||||
"lordjia/by-feng-zikai",
|
||||
hotswap=True,
|
||||
adapter_name="ikea"
|
||||
)
|
||||
```
|
||||
|
||||
The [`~loaders.peft.PeftAdapterMixin.delete_adapters`] function completely removes an adapter and their LoRA layers from a model.
|
||||
### Compiled models
|
||||
|
||||
For compiled models, use [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`] to avoid recompilation when hotswapping LoRAs. This method should be called *before* loading the first LoRA and `torch.compile` should be called *after* loading the first LoRA.
|
||||
|
||||
> [!TIP]
|
||||
> The [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`] method isn't always necessary if the second LoRA targets the identical LoRA ranks and scales as the first LoRA.
|
||||
|
||||
Within [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`], the `target_rank` parameter is important for setting the rank for all LoRA adapters. Setting it to `max_rank` sets it to the highest value. For LoRAs with different ranks, you set it to a higher rank value. The default rank value is 128.
|
||||
|
||||
```py
|
||||
pipe.delete_adapters("toy")
|
||||
pipe.get_active_adapters()
|
||||
["pixel"]
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
# load base model and LoRAs
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
# 1. enable_lora_hotswap
|
||||
pipeline.enable_lora_hotswap(target_rank=max_rank)
|
||||
pipeline.load_lora_weights(
|
||||
"ostris/ikea-instructions-lora-sdxl",
|
||||
weight_name="ikea_instructions_xl_v1_5.safetensors",
|
||||
adapter_name="ikea"
|
||||
)
|
||||
# 2. torch.compile
|
||||
pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
|
||||
|
||||
# 3. hotswap
|
||||
pipeline.load_lora_weights(
|
||||
"lordjia/by-feng-zikai",
|
||||
hotswap=True,
|
||||
adapter_name="ikea"
|
||||
)
|
||||
```
|
||||
|
||||
## PeftInputAutocastDisableHook
|
||||
> [!TIP]
|
||||
> Move your code inside the `with torch._dynamo.config.patch(error_on_recompile=True)` context manager to detect if a model was recompiled. If a model is recompiled despite following all the steps above, please open an [issue](https://github.com/huggingface/diffusers/issues) with a reproducible example.
|
||||
|
||||
[[autodoc]] hooks.layerwise_casting.PeftInputAutocastDisableHook
|
||||
There are still scenarios where recompulation is unavoidable, such as when the hotswapped LoRA targets more layers than the initial adapter. Try to load the LoRA that targets the most layers *first*. For more details about this limitation, refer to the PEFT [hotswapping](https://huggingface.co/docs/peft/main/en/package_reference/hotswap#peft.utils.hotswap.hotswap_adapter) docs.
|
||||
|
||||
## Merge
|
||||
|
||||
The weights from each LoRA can be merged together to produce a blend of multiple existing styles. There are several methods for merging LoRAs, each of which differ in *how* the weights are merged (may affect generation quality).
|
||||
|
||||
### set_adapters
|
||||
|
||||
The [`~loaders.PeftAdapterMixin.set_adapters`] method merges LoRAs by concatenating their weighted matrices. Pass the LoRA names to [`~loaders.PeftAdapterMixin.set_adapters`] and use the `adapter_weights` parameter to control the scaling of each LoRA. For example, if `adapter_weights=[0.5, 0.5]`, the output is an average of both LoRAs.
|
||||
|
||||
> [!TIP]
|
||||
> The `"scale"` parameter determines how much of the merged LoRA to apply. See the [Weight scale](#weight-scale) section for more details.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.load_lora_weights(
|
||||
"ostris/ikea-instructions-lora-sdxl",
|
||||
weight_name="ikea_instructions_xl_v1_5.safetensors",
|
||||
adapter_name="ikea"
|
||||
)
|
||||
pipeline.load_lora_weights(
|
||||
"lordjia/by-feng-zikai",
|
||||
weight_name="fengzikai_v1.0_XL.safetensors",
|
||||
adapter_name="feng"
|
||||
)
|
||||
pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
|
||||
# use by Feng Zikai to activate the lordjia/by-feng-zikai LoRA
|
||||
pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai", cross_attention_kwargs={"scale": 1.0}).images[0]
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lora_merge_set_adapters.png"/>
|
||||
</div>
|
||||
|
||||
### add_weighted_adapter
|
||||
|
||||
> [!TIP]
|
||||
> This is an experimental method and you can refer to PEFTs [Model merging](https://huggingface.co/docs/peft/developer_guides/model_merging) for more details. Take a look at this [issue](https://github.com/huggingface/diffusers/issues/6892) if you're interested in the motivation and design behind this integration.
|
||||
|
||||
The [`~peft.LoraModel.add_weighted_adapter`] method enables more efficient merging methods like [TIES](https://huggingface.co/papers/2306.01708) or [DARE](https://huggingface.co/papers/2311.03099). These merging methods remove redundant and potentially interfering parameters from merged models. Keep in mind the LoRA ranks need to have identical ranks to be merged.
|
||||
|
||||
Make sure the latest stable version of Diffusers and PEFT is installed.
|
||||
|
||||
```bash
|
||||
pip install -U -q diffusers peft
|
||||
```
|
||||
|
||||
Load a UNET that corresponds to the LoRA UNet.
|
||||
|
||||
```py
|
||||
import copy
|
||||
import torch
|
||||
from diffusers import AutoModel, DiffusionPipeline
|
||||
from peft import get_peft_model, LoraConfig, PeftModel
|
||||
|
||||
unet = AutoModel.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
variant="fp16",
|
||||
subfolder="unet",
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
Load a pipeline, pass the UNet to it, and load a LoRA.
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
variant="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
unet=unet
|
||||
).to("cuda")
|
||||
pipeline.load_lora_weights(
|
||||
"ostris/ikea-instructions-lora-sdxl",
|
||||
weight_name="ikea_instructions_xl_v1_5.safetensors",
|
||||
adapter_name="ikea"
|
||||
)
|
||||
```
|
||||
|
||||
Create a [`~peft.PeftModel`] from the LoRA checkpoint by combining the first UNet you loaded and the LoRA UNet from the pipeline.
|
||||
|
||||
```py
|
||||
sdxl_unet = copy.deepcopy(unet)
|
||||
ikea_peft_model = get_peft_model(
|
||||
sdxl_unet,
|
||||
pipeline.unet.peft_config["ikea"],
|
||||
adapter_name="ikea"
|
||||
)
|
||||
|
||||
original_state_dict = {f"base_model.model.{k}": v for k, v in pipeline.unet.state_dict().items()}
|
||||
ikea_peft_model.load_state_dict(original_state_dict, strict=True)
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> You can save and reuse the `ikea_peft_model` by pushing it to the Hub as shown below.
|
||||
> ```py
|
||||
> ikea_peft_model.push_to_hub("ikea_peft_model", token=TOKEN)
|
||||
> ```
|
||||
|
||||
Repeat this process and create a [`~peft.PeftModel`] for the second LoRA.
|
||||
|
||||
```py
|
||||
pipeline.delete_adapters("ikea")
|
||||
sdxl_unet.delete_adapters("ikea")
|
||||
|
||||
pipeline.load_lora_weights(
|
||||
"lordjia/by-feng-zikai",
|
||||
weight_name="fengzikai_v1.0_XL.safetensors",
|
||||
adapter_name="feng"
|
||||
)
|
||||
pipeline.set_adapters(adapter_names="feng")
|
||||
|
||||
feng_peft_model = get_peft_model(
|
||||
sdxl_unet,
|
||||
pipeline.unet.peft_config["feng"],
|
||||
adapter_name="feng"
|
||||
)
|
||||
|
||||
original_state_dict = {f"base_model.model.{k}": v for k, v in pipe.unet.state_dict().items()}
|
||||
feng_peft_model.load_state_dict(original_state_dict, strict=True)
|
||||
```
|
||||
|
||||
Load a base UNet model and load the adapters.
|
||||
|
||||
```py
|
||||
base_unet = AutoModel.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
variant="fp16",
|
||||
subfolder="unet",
|
||||
).to("cuda")
|
||||
|
||||
model = PeftModel.from_pretrained(
|
||||
base_unet,
|
||||
"stevhliu/ikea_peft_model",
|
||||
use_safetensors=True,
|
||||
subfolder="ikea",
|
||||
adapter_name="ikea"
|
||||
)
|
||||
model.load_adapter(
|
||||
"stevhliu/feng_peft_model",
|
||||
use_safetensors=True,
|
||||
subfolder="feng",
|
||||
adapter_name="feng"
|
||||
)
|
||||
```
|
||||
|
||||
Merge the LoRAs with [`~peft.LoraModel.add_weighted_adapter`] and specify how you want to merge them with `combination_type`. The example below uses the `"dare_linear"` method (refer to this [blog post](https://huggingface.co/blog/peft_merging) to learn more about these merging methods), which randomly prunes some weights and then performs a weighted sum of the tensors based on the set weightage of each LoRA in `weights`.
|
||||
|
||||
Activate the merged LoRAs with [`~loaders.PeftAdapterMixin.set_adapters`].
|
||||
|
||||
```py
|
||||
model.add_weighted_adapter(
|
||||
adapters=["ikea", "feng"],
|
||||
combination_type="dare_linear",
|
||||
weights=[1.0, 1.0],
|
||||
adapter_name="ikea-feng"
|
||||
)
|
||||
model.set_adapters("ikea-feng")
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
unet=model,
|
||||
variant="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai").images[0]
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ikea-feng-dare-linear.png"/>
|
||||
</div>
|
||||
|
||||
### fuse_lora
|
||||
|
||||
The [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method fuses the LoRA weights directly with the original UNet and text encoder weights of the underlying model. This reduces the overhead of loading the underlying model for each LoRA because it only loads the model once, which lowers memory usage and increases inference speed.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.load_lora_weights(
|
||||
"ostris/ikea-instructions-lora-sdxl",
|
||||
weight_name="ikea_instructions_xl_v1_5.safetensors",
|
||||
adapter_name="ikea"
|
||||
)
|
||||
pipeline.load_lora_weights(
|
||||
"lordjia/by-feng-zikai",
|
||||
weight_name="fengzikai_v1.0_XL.safetensors",
|
||||
adapter_name="feng"
|
||||
)
|
||||
pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
|
||||
```
|
||||
|
||||
Call [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] to fuse them. The `lora_scale` parameter controls how much to scale the output by with the LoRA weights. It is important to make this adjustment now because passing `scale` to `cross_attention_kwargs` won't work in the pipeline.
|
||||
|
||||
```py
|
||||
pipeline.fuse_lora(adapter_names=["ikea", "feng"], lora_scale=1.0)
|
||||
```
|
||||
|
||||
Unload the LoRA weights since they're already fused with the underlying model. Save the fused pipeline with either [`~DiffusionPipeline.save_pretrained`] to save it locally or [`~PushToHubMixin.push_to_hub`] to save it to the Hub.
|
||||
|
||||
<hfoptions id="save">
|
||||
<hfoption id="save locally">
|
||||
|
||||
```py
|
||||
pipeline.unload_lora_weights()
|
||||
pipeline.save_pretrained("path/to/fused-pipeline")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="save to Hub">
|
||||
|
||||
```py
|
||||
pipeline.unload_lora_weights()
|
||||
pipeline.push_to_hub("fused-ikea-feng")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
The fused pipeline can now be quickly loaded for inference without requiring each LoRA to be separately loaded.
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"username/fused-ikea-feng", torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai").images[0]
|
||||
```
|
||||
|
||||
Use [`~loaders.LoraLoaderMixin.unfuse_lora`] to restore the underlying models weights, for example, if you want to use a different `lora_scale` value. You can only unfuse if there is a single LoRA fused. For example, it won't work with the pipeline from above because there are multiple fused LoRAs. In these cases, you'll need to reload the entire model.
|
||||
|
||||
```py
|
||||
pipeline.unfuse_lora()
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/fuse_lora.png"/>
|
||||
</div>
|
||||
|
||||
## Manage
|
||||
|
||||
Diffusers provides several methods to help you manage working with LoRAs. These methods can be especially useful if you're working with multiple LoRAs.
|
||||
|
||||
### set_adapters
|
||||
|
||||
[`~loaders.PeftAdapterMixin.set_adapters`] also activates the current LoRA to use if there are multiple active LoRAs. This allows you to switch between different LoRAs by specifying their name.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.load_lora_weights(
|
||||
"ostris/ikea-instructions-lora-sdxl",
|
||||
weight_name="ikea_instructions_xl_v1_5.safetensors",
|
||||
adapter_name="ikea"
|
||||
)
|
||||
pipeline.load_lora_weights(
|
||||
"lordjia/by-feng-zikai",
|
||||
weight_name="fengzikai_v1.0_XL.safetensors",
|
||||
adapter_name="feng"
|
||||
)
|
||||
# activates the feng LoRA instead of the ikea LoRA
|
||||
pipeline.set_adapters("feng")
|
||||
```
|
||||
|
||||
### save_lora_adapter
|
||||
|
||||
Save an adapter with [`~loaders.PeftAdapterMixin.save_lora_adapter`].
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.unet.load_lora_adapter(
|
||||
"jbilcke-hf/sdxl-cinematic-1",
|
||||
weight_name="pytorch_lora_weights.safetensors",
|
||||
adapter_name="cinematic"
|
||||
prefix="unet"
|
||||
)
|
||||
pipeline.save_lora_adapter("path/to/save", adapter_name="cinematic")
|
||||
```
|
||||
|
||||
### unload_lora_weights
|
||||
|
||||
The [`~loaders.lora_base.LoraBaseMixin.unload_lora_weights`] method unloads any LoRA weights in the pipeline to restore the underlying model weights.
|
||||
|
||||
```py
|
||||
pipeline.unload_lora_weights()
|
||||
```
|
||||
|
||||
### disable_lora
|
||||
|
||||
The [`~loaders.PeftAdapterMixin.disable_lora`] method disables all LoRAs (but they're still kept on the pipeline) and restores the pipeline to the underlying model weights.
|
||||
|
||||
```py
|
||||
pipeline.disable_lora()
|
||||
```
|
||||
|
||||
### get_active_adapters
|
||||
|
||||
The [`~loaders.lora_base.LoraBaseMixin.get_active_adapters`] method returns a list of active LoRAs attached to a pipeline.
|
||||
|
||||
```py
|
||||
pipeline.get_active_adapters()
|
||||
["cereal", "ikea"]
|
||||
```
|
||||
|
||||
### get_list_adapters
|
||||
|
||||
The [`~loaders.lora_base.LoraBaseMixin.get_list_adapters`] method returns the active LoRAs for each component in the pipeline.
|
||||
|
||||
```py
|
||||
pipeline.get_list_adapters()
|
||||
{"unet": ["cereal", "ikea"], "text_encoder_2": ["cereal"]}
|
||||
```
|
||||
|
||||
### delete_adapters
|
||||
|
||||
The [`~loaders.PeftAdapterMixin.delete_adapters`] method completely removes a LoRA and its layers from a model.
|
||||
|
||||
```py
|
||||
pipeline.delete_adapters("ikea")
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
Browse the [LoRA Studio](https://lorastudio.co/models) for different LoRAs to use or you can upload your favorite LoRAs from Civitai to the Hub with the Space below.
|
||||
|
||||
<iframe
|
||||
src="https://multimodalart-civitai-to-hf.hf.space"
|
||||
frameborder="0"
|
||||
width="850"
|
||||
height="450"
|
||||
></iframe>
|
||||
|
||||
You can find additional LoRAs in the [FLUX LoRA the Explorer](https://huggingface.co/spaces/multimodalart/flux-lora-the-explorer) and [LoRA the Explorer](https://huggingface.co/spaces/multimodalart/LoraTheExplorer) Spaces.
|
||||
@@ -12,46 +12,28 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# ControlNet
|
||||
|
||||
ControlNet is a type of model for controlling image diffusion models by conditioning the model with an additional input image. There are many types of conditioning inputs (canny edge, user sketching, human pose, depth, and more) you can use to control a diffusion model. This is hugely useful because it affords you greater control over image generation, making it easier to generate specific images without experimenting with different text prompts or denoising values as much.
|
||||
[ControlNet](https://huggingface.co/papers/2302.05543) is an adapter that enables controllable generation such as generating an image of a cat in a *specific pose* or following the lines in a sketch of a *specific* cat. It works by adding a smaller network of "zero convolution" layers and progressively training these to avoid disrupting with the original model. The original model parameters are frozen to avoid retraining it.
|
||||
|
||||
<Tip>
|
||||
A ControlNet is conditioned on extra visual information or "structural controls" (canny edge, depth maps, human pose, etc.) that can be combined with text prompts to generate images that are guided by the visual input.
|
||||
|
||||
Check out Section 3.5 of the [ControlNet](https://huggingface.co/papers/2302.05543) paper v1 for a list of ControlNet implementations on various conditioning inputs. You can find the official Stable Diffusion ControlNet conditioned models on [lllyasviel](https://huggingface.co/lllyasviel)'s Hub profile, and more [community-trained](https://huggingface.co/models?other=stable-diffusion&other=controlnet) ones on the Hub.
|
||||
> [!TIP]
|
||||
> ControlNets are available to many models such as [Flux](../api/pipelines/controlnet_flux), [Hunyuan-DiT](../api/pipelines/controlnet_hunyuandit), [Stable Diffusion 3](../api/pipelines/controlnet_sd3), and more. The examples in this guide use Flux and Stable Diffusion XL.
|
||||
|
||||
For Stable Diffusion XL (SDXL) ControlNet models, you can find them on the 🤗 [Diffusers](https://huggingface.co/diffusers) Hub organization, or you can browse [community-trained](https://huggingface.co/models?other=stable-diffusion-xl&other=controlnet) ones on the Hub.
|
||||
Load a ControlNet conditioned on a specific control, such as canny edge, and pass it to the pipeline in [`~DiffusionPipeline.from_pretrained`].
|
||||
|
||||
</Tip>
|
||||
<hfoptions id="usage">
|
||||
<hfoption id="text-to-image">
|
||||
|
||||
A ControlNet model has two sets of weights (or blocks) connected by a zero-convolution layer:
|
||||
|
||||
- a *locked copy* keeps everything a large pretrained diffusion model has learned
|
||||
- a *trainable copy* is trained on the additional conditioning input
|
||||
|
||||
Since the locked copy preserves the pretrained model, training and implementing a ControlNet on a new conditioning input is as fast as finetuning any other model because you aren't training the model from scratch.
|
||||
|
||||
This guide will show you how to use ControlNet for text-to-image, image-to-image, inpainting, and more! There are many types of ControlNet conditioning inputs to choose from, but in this guide we'll only focus on several of them. Feel free to experiment with other conditioning inputs!
|
||||
|
||||
Before you begin, make sure you have the following libraries installed:
|
||||
Generate a canny image with [opencv-python](https://github.com/opencv/opencv-python).
|
||||
|
||||
```py
|
||||
# uncomment to install the necessary libraries in Colab
|
||||
#!pip install -q diffusers transformers accelerate opencv-python
|
||||
```
|
||||
|
||||
## Text-to-image
|
||||
|
||||
For text-to-image, you normally pass a text prompt to the model. But with ControlNet, you can specify an additional conditioning input. Let's condition the model with a canny image, a white outline of an image on a black background. This way, the ControlNet can use the canny image as a control to guide the model to generate an image with the same outline.
|
||||
|
||||
Load an image and use the [opencv-python](https://github.com/opencv/opencv-python) library to extract the canny image:
|
||||
|
||||
```py
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
from PIL import Image
|
||||
import cv2
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from diffusers.utils import load_image
|
||||
|
||||
original_image = load_image(
|
||||
"https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"
|
||||
)
|
||||
|
||||
image = np.array(original_image)
|
||||
@@ -65,523 +47,300 @@ image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/vermeer_canny_edged.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">canny image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Next, load a ControlNet model conditioned on canny edge detection and pass it to the [`StableDiffusionControlNetPipeline`]. Use the faster [`UniPCMultistepScheduler`] and enable model offloading to speed up inference and reduce memory usage.
|
||||
Pass the canny image to the pipeline. Use the `controlnet_conditioning_scale` parameter to determine how much weight to assign to the control.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel, UniPCMultistepScheduler
|
||||
import torch
|
||||
from diffusers.utils import load_image
|
||||
from diffusers import FluxControlNetPipeline, FluxControlNetModel
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16, use_safetensors=True)
|
||||
pipe = StableDiffusionControlNetPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
|
||||
controlnet = FluxControlNetModel.from_pretrained(
|
||||
"InstantX/FLUX.1-dev-Controlnet-Canny", torch_dtype=torch.bfloat16
|
||||
)
|
||||
pipeline = FluxControlNetPipeline.from_pretrained(
|
||||
"black-forest-labs/FLUX.1-dev", controlnet=controlnet, torch_dtype=torch.bfloat16
|
||||
).to("cuda")
|
||||
|
||||
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
pipe.enable_model_cpu_offload()
|
||||
```
|
||||
prompt = """
|
||||
A photorealistic overhead image of a cat reclining sideways in a flamingo pool floatie holding a margarita.
|
||||
The cat is floating leisurely in the pool and completely relaxed and happy.
|
||||
"""
|
||||
|
||||
Now pass your prompt and canny image to the pipeline:
|
||||
|
||||
```py
|
||||
output = pipe(
|
||||
"the mona lisa", image=canny_image
|
||||
pipeline(
|
||||
prompt,
|
||||
control_image=canny_image,
|
||||
controlnet_conditioning_scale=0.5,
|
||||
num_inference_steps=50,
|
||||
guidance_scale=3.5,
|
||||
).images[0]
|
||||
make_image_grid([original_image, canny_image, output], rows=1, cols=3)
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-text2img.png"/>
|
||||
<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png" width="300" alt="Generated image (prompt only)"/>
|
||||
<figcaption style="text-align: center;">original image</figcaption>
|
||||
</figure>
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png" width="300" alt="Control image (Canny edges)"/>
|
||||
<figcaption style="text-align: center;">canny image</figcaption>
|
||||
</figure>
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat-generated.png" width="300" alt="Generated image (ControlNet + prompt)"/>
|
||||
<figcaption style="text-align: center;">generated image</figcaption>
|
||||
</figure>
|
||||
</div>
|
||||
|
||||
## Image-to-image
|
||||
|
||||
For image-to-image, you'd typically pass an initial image and a prompt to the pipeline to generate a new image. With ControlNet, you can pass an additional conditioning input to guide the model. Let's condition the model with a depth map, an image which contains spatial information. This way, the ControlNet can use the depth map as a control to guide the model to generate an image that preserves spatial information.
|
||||
</hfoption>
|
||||
<hfoption id="image-to-image">
|
||||
|
||||
You'll use the [`StableDiffusionControlNetImg2ImgPipeline`] for this task, which is different from the [`StableDiffusionControlNetPipeline`] because it allows you to pass an initial image as the starting point for the image generation process.
|
||||
|
||||
Load an image and use the `depth-estimation` [`~transformers.Pipeline`] from 🤗 Transformers to extract the depth map of an image:
|
||||
Generate a depth map with a depth estimation pipeline from Transformers.
|
||||
|
||||
```py
|
||||
import torch
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from transformers import DPTImageProcessor, DPTForDepthEstimation
|
||||
from diffusers import ControlNetModel, StableDiffusionXLControlNetImg2ImgPipeline, AutoencoderKL
|
||||
from diffusers.utils import load_image
|
||||
|
||||
from transformers import pipeline
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
image = load_image(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-img2img.jpg"
|
||||
)
|
||||
depth_estimator = DPTForDepthEstimation.from_pretrained("Intel/dpt-hybrid-midas").to("cuda")
|
||||
feature_extractor = DPTImageProcessor.from_pretrained("Intel/dpt-hybrid-midas")
|
||||
|
||||
def get_depth_map(image, depth_estimator):
|
||||
image = depth_estimator(image)["depth"]
|
||||
image = np.array(image)
|
||||
image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
detected_map = torch.from_numpy(image).float() / 255.0
|
||||
depth_map = detected_map.permute(2, 0, 1)
|
||||
return depth_map
|
||||
def get_depth_map(image):
|
||||
image = feature_extractor(images=image, return_tensors="pt").pixel_values.to("cuda")
|
||||
with torch.no_grad(), torch.autocast("cuda"):
|
||||
depth_map = depth_estimator(image).predicted_depth
|
||||
|
||||
depth_estimator = pipeline("depth-estimation")
|
||||
depth_map = get_depth_map(image, depth_estimator).unsqueeze(0).half().to("cuda")
|
||||
```
|
||||
|
||||
Next, load a ControlNet model conditioned on depth maps and pass it to the [`StableDiffusionControlNetImg2ImgPipeline`]. Use the faster [`UniPCMultistepScheduler`] and enable model offloading to speed up inference and reduce memory usage.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler
|
||||
import torch
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11f1p_sd15_depth", torch_dtype=torch.float16, use_safetensors=True)
|
||||
pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
|
||||
)
|
||||
|
||||
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
pipe.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
Now pass your prompt, initial image, and depth map to the pipeline:
|
||||
|
||||
```py
|
||||
output = pipe(
|
||||
"lego batman and robin", image=image, control_image=depth_map,
|
||||
).images[0]
|
||||
make_image_grid([image, output], rows=1, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-img2img.jpg"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-img2img-2.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Inpainting
|
||||
|
||||
For inpainting, you need an initial image, a mask image, and a prompt describing what to replace the mask with. ControlNet models allow you to add another control image to condition a model with. Let’s condition the model with an inpainting mask. This way, the ControlNet can use the inpainting mask as a control to guide the model to generate an image within the mask area.
|
||||
|
||||
Load an initial image and a mask image:
|
||||
|
||||
```py
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
|
||||
init_image = load_image(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint.jpg"
|
||||
)
|
||||
init_image = init_image.resize((512, 512))
|
||||
|
||||
mask_image = load_image(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint-mask.jpg"
|
||||
)
|
||||
mask_image = mask_image.resize((512, 512))
|
||||
make_image_grid([init_image, mask_image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
Create a function to prepare the control image from the initial and mask images. This'll create a tensor to mark the pixels in `init_image` as masked if the corresponding pixel in `mask_image` is over a certain threshold.
|
||||
|
||||
```py
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
def make_inpaint_condition(image, image_mask):
|
||||
image = np.array(image.convert("RGB")).astype(np.float32) / 255.0
|
||||
image_mask = np.array(image_mask.convert("L")).astype(np.float32) / 255.0
|
||||
|
||||
assert image.shape[0:1] == image_mask.shape[0:1]
|
||||
image[image_mask > 0.5] = -1.0 # set as masked pixel
|
||||
image = np.expand_dims(image, 0).transpose(0, 3, 1, 2)
|
||||
image = torch.from_numpy(image)
|
||||
depth_map = torch.nn.functional.interpolate(
|
||||
depth_map.unsqueeze(1),
|
||||
size=(1024, 1024),
|
||||
mode="bicubic",
|
||||
align_corners=False,
|
||||
)
|
||||
depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True)
|
||||
depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True)
|
||||
depth_map = (depth_map - depth_min) / (depth_max - depth_min)
|
||||
image = torch.cat([depth_map] * 3, dim=1)
|
||||
image = image.permute(0, 2, 3, 1).cpu().numpy()[0]
|
||||
image = Image.fromarray((image * 255.0).clip(0, 255).astype(np.uint8))
|
||||
return image
|
||||
|
||||
control_image = make_inpaint_condition(init_image, mask_image)
|
||||
depth_image = get_depth_map(image)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint.jpg"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint-mask.jpg"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">mask image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Load a ControlNet model conditioned on inpainting and pass it to the [`StableDiffusionControlNetInpaintPipeline`]. Use the faster [`UniPCMultistepScheduler`] and enable model offloading to speed up inference and reduce memory usage.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, UniPCMultistepScheduler
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained("lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16, use_safetensors=True)
|
||||
pipe = StableDiffusionControlNetInpaintPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16, use_safetensors=True
|
||||
)
|
||||
|
||||
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
pipe.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
Now pass your prompt, initial image, mask image, and control image to the pipeline:
|
||||
|
||||
```py
|
||||
output = pipe(
|
||||
"corgi face with large ears, detailed, pixar, animated, disney",
|
||||
num_inference_steps=20,
|
||||
eta=1.0,
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
control_image=control_image,
|
||||
).images[0]
|
||||
make_image_grid([init_image, mask_image, output], rows=1, cols=3)
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/controlnet-inpaint-result.png"/>
|
||||
</div>
|
||||
|
||||
## Guess mode
|
||||
|
||||
[Guess mode](https://github.com/lllyasviel/ControlNet/discussions/188) does not require supplying a prompt to a ControlNet at all! This forces the ControlNet encoder to do its best to "guess" the contents of the input control map (depth map, pose estimation, canny edge, etc.).
|
||||
|
||||
Guess mode adjusts the scale of the output residuals from a ControlNet by a fixed ratio depending on the block depth. The shallowest `DownBlock` corresponds to 0.1, and as the blocks get deeper, the scale increases exponentially such that the scale of the `MidBlock` output becomes 1.0.
|
||||
|
||||
<Tip>
|
||||
|
||||
Guess mode does not have any impact on prompt conditioning and you can still provide a prompt if you want.
|
||||
|
||||
</Tip>
|
||||
|
||||
Set `guess_mode=True` in the pipeline, and it is [recommended](https://github.com/lllyasviel/ControlNet#guess-mode--non-prompt-mode) to set the `guidance_scale` value between 3.0 and 5.0.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionControlNetPipeline, ControlNetModel
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image
|
||||
import cv2
|
||||
|
||||
controlnet = ControlNetModel.from_pretrained("lllyasviel/sd-controlnet-canny", use_safetensors=True)
|
||||
pipe = StableDiffusionControlNetPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", controlnet=controlnet, use_safetensors=True).to("cuda")
|
||||
|
||||
original_image = load_image("https://huggingface.co/takuma104/controlnet_dev/resolve/main/bird_512x512.png")
|
||||
|
||||
image = np.array(original_image)
|
||||
|
||||
low_threshold = 100
|
||||
high_threshold = 200
|
||||
|
||||
image = cv2.Canny(image, low_threshold, high_threshold)
|
||||
image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image)
|
||||
|
||||
image = pipe("", image=canny_image, guess_mode=True, guidance_scale=3.0).images[0]
|
||||
make_image_grid([original_image, canny_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare_guess_mode/output_images/diffusers/output_bird_canny_0.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">regular mode with prompt</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/takuma104/controlnet_dev/resolve/main/gen_compare_guess_mode/output_images/diffusers/output_bird_canny_0_gm.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">guess mode without prompt</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## ControlNet with Stable Diffusion XL
|
||||
|
||||
There aren't too many ControlNet models compatible with Stable Diffusion XL (SDXL) at the moment, but we've trained two full-sized ControlNet models for SDXL conditioned on canny edge detection and depth maps. We're also experimenting with creating smaller versions of these SDXL-compatible ControlNet models so it is easier to run on resource-constrained hardware. You can find these checkpoints on the [🤗 Diffusers Hub organization](https://huggingface.co/diffusers)!
|
||||
|
||||
Let's use a SDXL ControlNet conditioned on canny images to generate an image. Start by loading an image and prepare the canny image:
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
from PIL import Image
|
||||
import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
original_image = load_image(
|
||||
"https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
|
||||
)
|
||||
|
||||
image = np.array(original_image)
|
||||
|
||||
low_threshold = 100
|
||||
high_threshold = 200
|
||||
|
||||
image = cv2.Canny(image, low_threshold, high_threshold)
|
||||
image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image)
|
||||
make_image_grid([original_image, canny_image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hf-logo-canny.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">canny image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Load a SDXL ControlNet model conditioned on canny edge detection and pass it to the [`StableDiffusionXLControlNetPipeline`]. You can also enable model offloading to reduce memory usage.
|
||||
Pass the depth map to the pipeline. Use the `controlnet_conditioning_scale` parameter to determine how much weight to assign to the control.
|
||||
|
||||
```py
|
||||
controlnet = ControlNetModel.from_pretrained(
|
||||
"diffusers/controlnet-canny-sdxl-1.0",
|
||||
"diffusers/controlnet-depth-sdxl-1.0-small",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True
|
||||
)
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
|
||||
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
|
||||
pipeline = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
controlnet=controlnet,
|
||||
vae=vae,
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True
|
||||
)
|
||||
pipe.enable_model_cpu_offload()
|
||||
```
|
||||
).to("cuda")
|
||||
|
||||
Now pass your prompt (and optionally a negative prompt if you're using one) and canny image to the pipeline:
|
||||
|
||||
<Tip>
|
||||
|
||||
The [`controlnet_conditioning_scale`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetPipeline.__call__.controlnet_conditioning_scale) parameter determines how much weight to assign to the conditioning inputs. A value of 0.5 is recommended for good generalization, but feel free to experiment with this number!
|
||||
|
||||
</Tip>
|
||||
|
||||
```py
|
||||
prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
|
||||
negative_prompt = 'low quality, bad quality, sketches'
|
||||
|
||||
image = pipe(
|
||||
prompt = """
|
||||
A photorealistic overhead image of a cat reclining sideways in a flamingo pool floatie holding a margarita.
|
||||
The cat is floating leisurely in the pool and completely relaxed and happy.
|
||||
"""
|
||||
image = load_image(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"
|
||||
).resize((1024, 1024))
|
||||
controlnet_conditioning_scale = 0.5
|
||||
pipeline(
|
||||
prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
image=canny_image,
|
||||
controlnet_conditioning_scale=0.5,
|
||||
image=image,
|
||||
control_image=depth_image,
|
||||
controlnet_conditioning_scale=controlnet_conditioning_scale,
|
||||
strength=0.99,
|
||||
num_inference_steps=100,
|
||||
).images[0]
|
||||
make_image_grid([original_image, canny_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/diffusers/controlnet-canny-sdxl-1.0/resolve/main/out_hug_lab_7.png"/>
|
||||
<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png" width="300" alt="Generated image (prompt only)"/>
|
||||
<figcaption style="text-align: center;">original image</figcaption>
|
||||
</figure>
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_depth_image.png" width="300" alt="Control image (Canny edges)"/>
|
||||
<figcaption style="text-align: center;">depth map</figcaption>
|
||||
</figure>
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_depth_cat.png" width="300" alt="Generated image (ControlNet + prompt)"/>
|
||||
<figcaption style="text-align: center;">generated image</figcaption>
|
||||
</figure>
|
||||
</div>
|
||||
|
||||
You can use [`StableDiffusionXLControlNetPipeline`] in guess mode as well by setting the parameter to `True`:
|
||||
</hfoption>
|
||||
<hfoption id="inpainting">
|
||||
|
||||
Generate a mask image and convert it to a tensor to mark the pixels in the original image as masked if the corresponding pixel in the mask image is over a certain threshold.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
import numpy as np
|
||||
import torch
|
||||
import cv2
|
||||
import torch
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
from diffusers.utils import load_image
|
||||
from diffusers import StableDiffusionXLControlNetInpaintPipeline, ControlNetModel
|
||||
|
||||
prompt = "aerial view, a futuristic research complex in a bright foggy jungle, hard lighting"
|
||||
negative_prompt = "low quality, bad quality, sketches"
|
||||
|
||||
original_image = load_image(
|
||||
"https://hf.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png"
|
||||
init_image = load_image(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"
|
||||
)
|
||||
init_image = init_image.resize((1024, 1024))
|
||||
mask_image = load_image(
|
||||
"/content/cat_mask.png"
|
||||
)
|
||||
mask_image = mask_image.resize((1024, 1024))
|
||||
|
||||
def make_canny_condition(image):
|
||||
image = np.array(image)
|
||||
image = cv2.Canny(image, 100, 200)
|
||||
image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
image = Image.fromarray(image)
|
||||
return image
|
||||
|
||||
control_image = make_canny_condition(init_image)
|
||||
```
|
||||
|
||||
Pass the mask and control image to the pipeline. Use the `controlnet_conditioning_scale` parameter to determine how much weight to assign to the control.
|
||||
|
||||
```py
|
||||
controlnet = ControlNetModel.from_pretrained(
|
||||
"diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16, use_safetensors=True
|
||||
"diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
|
||||
)
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
|
||||
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, vae=vae, torch_dtype=torch.float16, use_safetensors=True
|
||||
pipeline = StableDiffusionXLControlNetInpaintPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnet, torch_dtype=torch.float16
|
||||
)
|
||||
pipe.enable_model_cpu_offload()
|
||||
|
||||
image = np.array(original_image)
|
||||
image = cv2.Canny(image, 100, 200)
|
||||
image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image)
|
||||
|
||||
image = pipe(
|
||||
prompt, negative_prompt=negative_prompt, controlnet_conditioning_scale=0.5, image=canny_image, guess_mode=True,
|
||||
).images[0]
|
||||
make_image_grid([original_image, canny_image, image], rows=1, cols=3)
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
You can use a refiner model with `StableDiffusionXLControlNetPipeline` to improve image quality, just like you can with a regular `StableDiffusionXLPipeline`.
|
||||
See the [Refine image quality](./sdxl#refine-image-quality) section to learn how to use the refiner model.
|
||||
Make sure to use `StableDiffusionXLControlNetPipeline` and pass `image` and `controlnet_conditioning_scale`.
|
||||
|
||||
```py
|
||||
base = StableDiffusionXLControlNetPipeline(...)
|
||||
image = base(
|
||||
prompt=prompt,
|
||||
pipeline(
|
||||
"a cute and fluffy bunny rabbit",
|
||||
num_inference_steps=100,
|
||||
strength=0.99,
|
||||
controlnet_conditioning_scale=0.5,
|
||||
image=canny_image,
|
||||
num_inference_steps=40,
|
||||
denoising_end=0.8,
|
||||
output_type="latent",
|
||||
).images
|
||||
# rest exactly as with StableDiffusionXLPipeline
|
||||
image=init_image,
|
||||
mask_image=mask_image,
|
||||
control_image=control_image,
|
||||
).images[0]
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
## MultiControlNet
|
||||
|
||||
<Tip>
|
||||
|
||||
Replace the SDXL model with a model like [stable-diffusion-v1-5/stable-diffusion-v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) to use multiple conditioning inputs with Stable Diffusion models.
|
||||
|
||||
</Tip>
|
||||
|
||||
You can compose multiple ControlNet conditionings from different image inputs to create a *MultiControlNet*. To get better results, it is often helpful to:
|
||||
|
||||
1. mask conditionings such that they don't overlap (for example, mask the area of a canny image where the pose conditioning is located)
|
||||
2. experiment with the [`controlnet_conditioning_scale`](https://huggingface.co/docs/diffusers/main/en/api/pipelines/controlnet#diffusers.StableDiffusionControlNetPipeline.__call__.controlnet_conditioning_scale) parameter to determine how much weight to assign to each conditioning input
|
||||
|
||||
In this example, you'll combine a canny image and a human pose estimation image to generate a new image.
|
||||
|
||||
Prepare the canny image conditioning:
|
||||
|
||||
```py
|
||||
from diffusers.utils import load_image, make_image_grid
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
import cv2
|
||||
|
||||
original_image = load_image(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/landscape.png"
|
||||
)
|
||||
image = np.array(original_image)
|
||||
|
||||
low_threshold = 100
|
||||
high_threshold = 200
|
||||
|
||||
image = cv2.Canny(image, low_threshold, high_threshold)
|
||||
|
||||
# zero out middle columns of image where pose will be overlaid
|
||||
zero_start = image.shape[1] // 4
|
||||
zero_end = zero_start + image.shape[1] // 2
|
||||
image[:, zero_start:zero_end] = 0
|
||||
|
||||
image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image)
|
||||
make_image_grid([original_image, canny_image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/landscape.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/controlnet/landscape_canny_masked.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">canny image</figcaption>
|
||||
</div>
|
||||
<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png" width="300" alt="Generated image (prompt only)"/>
|
||||
<figcaption style="text-align: center;">original image</figcaption>
|
||||
</figure>
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/cat_mask.png" width="300" alt="Control image (Canny edges)"/>
|
||||
<figcaption style="text-align: center;">mask image</figcaption>
|
||||
</figure>
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_rabbit_inpaint.png" width="300" alt="Generated image (ControlNet + prompt)"/>
|
||||
<figcaption style="text-align: center;">generated image</figcaption>
|
||||
</figure>
|
||||
</div>
|
||||
|
||||
For human pose estimation, install [controlnet_aux](https://github.com/patrickvonplaten/controlnet_aux):
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Multi-ControlNet
|
||||
|
||||
You can compose multiple ControlNet conditionings, such as canny image and a depth map, to create a *MultiControlNet*. For the best rersults, you should mask conditionings so they don't overlap and experiment with different `controlnet_conditioning_scale` parameters to adjust how much weight is assigned to each control input.
|
||||
|
||||
The example below composes a canny image and depth map.
|
||||
|
||||
Pass the ControlNets as a list to the pipeline and resize the images to the expected input size.
|
||||
|
||||
```py
|
||||
# uncomment to install the necessary library in Colab
|
||||
#!pip install -q controlnet-aux
|
||||
```
|
||||
|
||||
Prepare the human pose estimation conditioning:
|
||||
|
||||
```py
|
||||
from controlnet_aux import OpenposeDetector
|
||||
|
||||
openpose = OpenposeDetector.from_pretrained("lllyasviel/ControlNet")
|
||||
original_image = load_image(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/person.png"
|
||||
)
|
||||
openpose_image = openpose(original_image)
|
||||
make_image_grid([original_image, openpose_image], rows=1, cols=2)
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/person.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/blog/controlnet/person_pose.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">human pose image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Load a list of ControlNet models that correspond to each conditioning, and pass them to the [`StableDiffusionXLControlNetPipeline`]. Use the faster [`UniPCMultistepScheduler`] and enable model offloading to reduce memory usage.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL, UniPCMultistepScheduler
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel, AutoencoderKL
|
||||
|
||||
controlnets = [
|
||||
ControlNetModel.from_pretrained(
|
||||
"thibaud/controlnet-openpose-sdxl-1.0", torch_dtype=torch.float16
|
||||
"diffusers/controlnet-depth-sdxl-1.0-small", torch_dtype=torch.float16
|
||||
),
|
||||
ControlNetModel.from_pretrained(
|
||||
"diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16, use_safetensors=True
|
||||
"diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16,
|
||||
),
|
||||
]
|
||||
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
|
||||
pipe = StableDiffusionXLControlNetPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnets, vae=vae, torch_dtype=torch.float16, use_safetensors=True
|
||||
)
|
||||
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
|
||||
pipe.enable_model_cpu_offload()
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
|
||||
pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", controlnet=controlnets, vae=vae, torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
prompt = """
|
||||
a relaxed rabbit sitting on a striped towel next to a pool with a tropical drink nearby,
|
||||
bright sunny day, vacation scene, 35mm photograph, film, professional, 4k, highly detailed
|
||||
"""
|
||||
negative_prompt = "lowres, bad anatomy, worst quality, low quality, deformed, ugly"
|
||||
|
||||
images = [canny_image.resize((1024, 1024)), depth_image.resize((1024, 1024))]
|
||||
|
||||
pipeline(
|
||||
prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
image=images,
|
||||
num_inference_steps=100,
|
||||
controlnet_conditioning_scale=[0.5, 0.5],
|
||||
strength=0.7,
|
||||
).images[0]
|
||||
```
|
||||
|
||||
Now you can pass your prompt (an optional negative prompt if you're using one), canny image, and pose image to the pipeline:
|
||||
<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png" width="300" alt="Generated image (prompt only)"/>
|
||||
<figcaption style="text-align: center;">canny image</figcaption>
|
||||
</figure>
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/multicontrolnet_depth.png" width="300" alt="Control image (Canny edges)"/>
|
||||
<figcaption style="text-align: center;">depth map</figcaption>
|
||||
</figure>
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_multi_controlnet.png" width="300" alt="Generated image (ControlNet + prompt)"/>
|
||||
<figcaption style="text-align: center;">generated image</figcaption>
|
||||
</figure>
|
||||
</div>
|
||||
|
||||
## guess_mode
|
||||
|
||||
[Guess mode](https://github.com/lllyasviel/ControlNet/discussions/188) generates an image from **only** the control input (canny edge, depth map, pose, etc.) and without guidance from a prompt. It adjusts the scale of the ControlNet's output residuals by a fixed ratio depending on block depth. The earlier `DownBlock` is only scaled by `0.1` and the `MidBlock` is fully scaled by `1.0`.
|
||||
|
||||
```py
|
||||
prompt = "a giant standing in a fantasy landscape, best quality"
|
||||
negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
|
||||
import torch
|
||||
from diffusers.utils import load_iamge
|
||||
from diffusers import StableDiffusionXLControlNetPipeline, ControlNetModel
|
||||
|
||||
generator = torch.manual_seed(1)
|
||||
controlnet = ControlNetModel.from_pretrained(
|
||||
"diffusers/controlnet-canny-sdxl-1.0", torch_dtype=torch.float16
|
||||
)
|
||||
pipeline = StableDiffusionXLControlNetPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
controlnet=controlnet,
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
images = [openpose_image.resize((1024, 1024)), canny_image.resize((1024, 1024))]
|
||||
|
||||
images = pipe(
|
||||
prompt,
|
||||
image=images,
|
||||
num_inference_steps=25,
|
||||
generator=generator,
|
||||
negative_prompt=negative_prompt,
|
||||
num_images_per_prompt=3,
|
||||
controlnet_conditioning_scale=[1.0, 0.8],
|
||||
).images
|
||||
make_image_grid([original_image, canny_image, openpose_image,
|
||||
images[0].resize((512, 512)), images[1].resize((512, 512)), images[2].resize((512, 512))], rows=2, cols=3)
|
||||
canny_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png")
|
||||
pipeline(
|
||||
"",
|
||||
image=canny_image,
|
||||
guess_mode=True
|
||||
).images[0]
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/multicontrolnet.png"/>
|
||||
</div>
|
||||
<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png" width="300" alt="Control image (Canny edges)"/>
|
||||
<figcaption style="text-align: center;">canny image</figcaption>
|
||||
</figure>
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/guess_mode.png" width="300" alt="Generated image (Guess mode)"/>
|
||||
<figcaption style="text-align: center;">generated image</figcaption>
|
||||
</figure>
|
||||
</div>
|
||||
@@ -0,0 +1,35 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# DreamBooth
|
||||
|
||||
[DreamBooth](https://huggingface.co/papers/2208.12242) is a method for generating personalized images of a specific instance. It works by fine-tuning the model on 3-5 images of the subject (for example, a cat) that is associated with a unique identifier (`sks cat`). This allows you to use `sks cat` in your prompt to trigger the model to generate images of your cat in different settings, lighting, poses, and styles.
|
||||
|
||||
DreamBooth checkpoints are typically a few GBs in size because it contains the full model weights.
|
||||
|
||||
Load the DreamBooth checkpoint with [`~DiffusionPipeline.from_pretrained`] and include the unique identifier in the prompt to activate its generation.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"sd-dreambooth-library/herge-style",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
prompt = "A cute sks herge_style brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration"
|
||||
pipeline(prompt).images[0]
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_dreambooth.png" />
|
||||
</div>
|
||||
@@ -485,7 +485,7 @@ image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image).resize((1024, 1216))
|
||||
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda")
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, variant="fp16").to("cuda")
|
||||
|
||||
unet = UNet2DConditionModel.from_pretrained(
|
||||
"latent-consistency/lcm-sdxl",
|
||||
@@ -551,7 +551,7 @@ image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image).resize((1024, 1024))
|
||||
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, varient="fp16").to("cuda")
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16, variant="fp16").to("cuda")
|
||||
|
||||
pipe = StableDiffusionXLAdapterPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,416 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Load adapters
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
There are several [training](../training/overview) techniques for personalizing diffusion models to generate images of a specific subject or images in certain styles. Each of these training methods produces a different type of adapter. Some of the adapters generate an entirely new model, while other adapters only modify a smaller set of embeddings or weights. This means the loading process for each adapter is also different.
|
||||
|
||||
This guide will show you how to load DreamBooth, textual inversion, and LoRA weights.
|
||||
|
||||
<Tip>
|
||||
|
||||
Feel free to browse the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer), [LoRA the Explorer](https://huggingface.co/spaces/multimodalart/LoraTheExplorer), and the [Diffusers Models Gallery](https://huggingface.co/spaces/huggingface-projects/diffusers-gallery) for checkpoints and embeddings to use.
|
||||
|
||||
</Tip>
|
||||
|
||||
## DreamBooth
|
||||
|
||||
[DreamBooth](https://dreambooth.github.io/) finetunes an *entire diffusion model* on just several images of a subject to generate images of that subject in new styles and settings. This method works by using a special word in the prompt that the model learns to associate with the subject image. Of all the training methods, DreamBooth produces the largest file size (usually a few GBs) because it is a full checkpoint model.
|
||||
|
||||
Let's load the [herge_style](https://huggingface.co/sd-dreambooth-library/herge-style) checkpoint, which is trained on just 10 images drawn by Hergé, to generate images in that style. For it to work, you need to include the special word `herge_style` in your prompt to trigger the checkpoint:
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("sd-dreambooth-library/herge-style", torch_dtype=torch.float16).to("cuda")
|
||||
prompt = "A cute herge_style brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration"
|
||||
image = pipeline(prompt).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_dreambooth.png" />
|
||||
</div>
|
||||
|
||||
## Textual inversion
|
||||
|
||||
[Textual inversion](https://textual-inversion.github.io/) is very similar to DreamBooth and it can also personalize a diffusion model to generate certain concepts (styles, objects) from just a few images. This method works by training and finding new embeddings that represent the images you provide with a special word in the prompt. As a result, the diffusion model weights stay the same and the training process produces a relatively tiny (a few KBs) file.
|
||||
|
||||
Because textual inversion creates embeddings, it cannot be used on its own like DreamBooth and requires another model.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
|
||||
```
|
||||
|
||||
Now you can load the textual inversion embeddings with the [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] method and generate some images. Let's load the [sd-concepts-library/gta5-artwork](https://huggingface.co/sd-concepts-library/gta5-artwork) embeddings and you'll need to include the special word `<gta5-artwork>` in your prompt to trigger it:
|
||||
|
||||
```py
|
||||
pipeline.load_textual_inversion("sd-concepts-library/gta5-artwork")
|
||||
prompt = "A cute brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration, <gta5-artwork> style"
|
||||
image = pipeline(prompt).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_txt_embed.png" />
|
||||
</div>
|
||||
|
||||
Textual inversion can also be trained on undesirable things to create *negative embeddings* to discourage a model from generating images with those undesirable things like blurry images or extra fingers on a hand. This can be an easy way to quickly improve your prompt. You'll also load the embeddings with [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`], but this time, you'll need two more parameters:
|
||||
|
||||
- `weight_name`: specifies the weight file to load if the file was saved in the 🤗 Diffusers format with a specific name or if the file is stored in the A1111 format
|
||||
- `token`: specifies the special word to use in the prompt to trigger the embeddings
|
||||
|
||||
Let's load the [sayakpaul/EasyNegative-test](https://huggingface.co/sayakpaul/EasyNegative-test) embeddings:
|
||||
|
||||
```py
|
||||
pipeline.load_textual_inversion(
|
||||
"sayakpaul/EasyNegative-test", weight_name="EasyNegative.safetensors", token="EasyNegative"
|
||||
)
|
||||
```
|
||||
|
||||
Now you can use the `token` to generate an image with the negative embeddings:
|
||||
|
||||
```py
|
||||
prompt = "A cute brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration, EasyNegative"
|
||||
negative_prompt = "EasyNegative"
|
||||
|
||||
image = pipeline(prompt, negative_prompt=negative_prompt, num_inference_steps=50).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png" />
|
||||
</div>
|
||||
|
||||
## LoRA
|
||||
|
||||
[Low-Rank Adaptation (LoRA)](https://huggingface.co/papers/2106.09685) is a popular training technique because it is fast and generates smaller file sizes (a couple hundred MBs). Like the other methods in this guide, LoRA can train a model to learn new styles from just a few images. It works by inserting new weights into the diffusion model and then only the new weights are trained instead of the entire model. This makes LoRAs faster to train and easier to store.
|
||||
|
||||
<Tip>
|
||||
|
||||
LoRA is a very general training technique that can be used with other training methods. For example, it is common to train a model with DreamBooth and LoRA. It is also increasingly common to load and merge multiple LoRAs to create new and unique images. You can learn more about it in the in-depth [Merge LoRAs](merge_loras) guide since merging is outside the scope of this loading guide.
|
||||
|
||||
</Tip>
|
||||
|
||||
LoRAs also need to be used with another model:
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
```
|
||||
|
||||
Then use the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method to load the [ostris/super-cereal-sdxl-lora](https://huggingface.co/ostris/super-cereal-sdxl-lora) weights and specify the weights filename from the repository:
|
||||
|
||||
```py
|
||||
pipeline.load_lora_weights("ostris/super-cereal-sdxl-lora", weight_name="cereal_box_sdxl_v1.safetensors")
|
||||
prompt = "bears, pizza bites"
|
||||
image = pipeline(prompt).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_lora.png" />
|
||||
</div>
|
||||
|
||||
The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method loads LoRA weights into both the UNet and text encoder. It is the preferred way for loading LoRAs because it can handle cases where:
|
||||
|
||||
- the LoRA weights don't have separate identifiers for the UNet and text encoder
|
||||
- the LoRA weights have separate identifiers for the UNet and text encoder
|
||||
|
||||
To directly load (and save) a LoRA adapter at the *model-level*, use [`~loaders.PeftAdapterMixin.load_lora_adapter`], which builds and prepares the necessary model configuration for the adapter. Like [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`], [`~loaders.PeftAdapterMixin.load_lora_adapter`] can load LoRAs for both the UNet and text encoder. For example, if you're loading a LoRA for the UNet, [`~loaders.PeftAdapterMixin.load_lora_adapter`] ignores the keys for the text encoder.
|
||||
|
||||
Use the `weight_name` parameter to specify the specific weight file and the `prefix` parameter to filter for the appropriate state dicts (`"unet"` in this case) to load.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.unet.load_lora_adapter("jbilcke-hf/sdxl-cinematic-1", weight_name="pytorch_lora_weights.safetensors", prefix="unet")
|
||||
|
||||
# use cnmt in the prompt to trigger the LoRA
|
||||
prompt = "A cute cnmt eating a slice of pizza, stunning color scheme, masterpiece, illustration"
|
||||
image = pipeline(prompt).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_attn_proc.png" />
|
||||
</div>
|
||||
|
||||
Save an adapter with [`~loaders.PeftAdapterMixin.save_lora_adapter`].
|
||||
|
||||
To unload the LoRA weights, use the [`~loaders.StableDiffusionLoraLoaderMixin.unload_lora_weights`] method to discard the LoRA weights and restore the model to its original weights:
|
||||
|
||||
```py
|
||||
pipeline.unload_lora_weights()
|
||||
```
|
||||
|
||||
### Adjust LoRA weight scale
|
||||
|
||||
For both [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] and [`~loaders.UNet2DConditionLoadersMixin.load_attn_procs`], you can pass the `cross_attention_kwargs={"scale": 0.5}` parameter to adjust how much of the LoRA weights to use. A value of `0` is the same as only using the base model weights, and a value of `1` is equivalent to using the fully finetuned LoRA.
|
||||
|
||||
For more granular control on the amount of LoRA weights used per layer, you can use [`~loaders.StableDiffusionLoraLoaderMixin.set_adapters`] and pass a dictionary specifying by how much to scale the weights in each layer by.
|
||||
```python
|
||||
pipe = ... # create pipeline
|
||||
pipe.load_lora_weights(..., adapter_name="my_adapter")
|
||||
scales = {
|
||||
"text_encoder": 0.5,
|
||||
"text_encoder_2": 0.5, # only usable if pipe has a 2nd text encoder
|
||||
"unet": {
|
||||
"down": 0.9, # all transformers in the down-part will use scale 0.9
|
||||
# "mid" # in this example "mid" is not given, therefore all transformers in the mid part will use the default scale 1.0
|
||||
"up": {
|
||||
"block_0": 0.6, # all 3 transformers in the 0th block in the up-part will use scale 0.6
|
||||
"block_1": [0.4, 0.8, 1.0], # the 3 transformers in the 1st block in the up-part will use scales 0.4, 0.8 and 1.0 respectively
|
||||
}
|
||||
}
|
||||
}
|
||||
pipe.set_adapters("my_adapter", scales)
|
||||
```
|
||||
|
||||
This also works with multiple adapters - see [this guide](https://huggingface.co/docs/diffusers/tutorials/using_peft_for_inference#customize-adapters-strength) for how to do it.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Currently, [`~loaders.StableDiffusionLoraLoaderMixin.set_adapters`] only supports scaling attention weights. If a LoRA has other parts (e.g., resnets or down-/upsamplers), they will keep a scale of 1.0.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Hotswapping LoRA adapters
|
||||
|
||||
A common use case when serving multiple adapters is to load one adapter first, generate images, load another adapter, generate more images, load another adapter, etc. This workflow normally requires calling [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`], [`~loaders.StableDiffusionLoraLoaderMixin.set_adapters`], and possibly [`~loaders.peft.PeftAdapterMixin.delete_adapters`] to save memory. Moreover, if the model is compiled using `torch.compile`, performing these steps requires recompilation, which takes time.
|
||||
|
||||
To better support this common workflow, you can "hotswap" a LoRA adapter, to avoid accumulating memory and in some cases, recompilation. It requires an adapter to already be loaded, and the new adapter weights are swapped in-place for the existing adapter.
|
||||
|
||||
Pass `hotswap=True` when loading a LoRA adapter to enable this feature. It is important to indicate the name of the existing adapter, (`default_0` is the default adapter name), to be swapped. If you loaded the first adapter with a different name, use that name instead.
|
||||
|
||||
```python
|
||||
pipe = ...
|
||||
# load adapter 1 as normal
|
||||
pipeline.load_lora_weights(file_name_adapter_1)
|
||||
# generate some images with adapter 1
|
||||
...
|
||||
# now hot swap the 2nd adapter
|
||||
pipeline.load_lora_weights(file_name_adapter_2, hotswap=True, adapter_name="default_0")
|
||||
# generate images with adapter 2
|
||||
```
|
||||
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Hotswapping is not currently supported for LoRA adapters that target the text encoder.
|
||||
|
||||
</Tip>
|
||||
|
||||
For compiled models, it is often (though not always if the second adapter targets identical LoRA ranks and scales) necessary to call [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`] to avoid recompilation. Use [`~loaders.lora_base.LoraBaseMixin.enable_lora_hotswap`] _before_ loading the first adapter, and `torch.compile` should be called _after_ loading the first adapter.
|
||||
|
||||
```python
|
||||
pipe = ...
|
||||
# call this extra method
|
||||
pipe.enable_lora_hotswap(target_rank=max_rank)
|
||||
# now load adapter 1
|
||||
pipe.load_lora_weights(file_name_adapter_1)
|
||||
# now compile the unet of the pipeline
|
||||
pipe.unet = torch.compile(pipeline.unet, ...)
|
||||
# generate some images with adapter 1
|
||||
...
|
||||
# now hot swap adapter 2
|
||||
pipeline.load_lora_weights(file_name_adapter_2, hotswap=True, adapter_name="default_0")
|
||||
# generate images with adapter 2
|
||||
```
|
||||
|
||||
The `target_rank=max_rank` argument is important for setting the maximum rank among all LoRA adapters that will be loaded. If you have one adapter with rank 8 and another with rank 16, pass `target_rank=16`. You should use a higher value if in doubt. By default, this value is 128.
|
||||
|
||||
However, there can be situations where recompilation is unavoidable. For example, if the hotswapped adapter targets more layers than the initial adapter, then recompilation is triggered. Try to load the adapter that targets the most layers first. Refer to the PEFT docs on [hotswapping](https://huggingface.co/docs/peft/main/en/package_reference/hotswap#peft.utils.hotswap.hotswap_adapter) for more details about the limitations of this feature.
|
||||
|
||||
<Tip>
|
||||
|
||||
Move your code inside the `with torch._dynamo.config.patch(error_on_recompile=True)` context manager to detect if a model was recompiled. If you detect recompilation despite following all the steps above, please open an issue with [Diffusers](https://github.com/huggingface/diffusers/issues) with a reproducible example.
|
||||
|
||||
</Tip>
|
||||
|
||||
### Kohya and TheLastBen
|
||||
|
||||
Other popular LoRA trainers from the community include those by [Kohya](https://github.com/kohya-ss/sd-scripts/) and [TheLastBen](https://github.com/TheLastBen/fast-stable-diffusion). These trainers create different LoRA checkpoints than those trained by 🤗 Diffusers, but they can still be loaded in the same way.
|
||||
|
||||
<hfoptions id="other-trainers">
|
||||
<hfoption id="Kohya">
|
||||
|
||||
To load a Kohya LoRA, let's download the [Blueprintify SD XL 1.0](https://civitai.com/models/150986/blueprintify-sd-xl-10) checkpoint from [Civitai](https://civitai.com/) as an example:
|
||||
|
||||
```sh
|
||||
!wget https://civitai.com/api/download/models/168776 -O blueprintify-sd-xl-10.safetensors
|
||||
```
|
||||
|
||||
Load the LoRA checkpoint with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method, and specify the filename in the `weight_name` parameter:
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_lora_weights("path/to/weights", weight_name="blueprintify-sd-xl-10.safetensors")
|
||||
```
|
||||
|
||||
Generate an image:
|
||||
|
||||
```py
|
||||
# use bl3uprint in the prompt to trigger the LoRA
|
||||
prompt = "bl3uprint, a highly detailed blueprint of the eiffel tower, explaining how to build all parts, many txt, blueprint grid backdrop"
|
||||
image = pipeline(prompt).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Some limitations of using Kohya LoRAs with 🤗 Diffusers include:
|
||||
|
||||
- Images may not look like those generated by UIs - like ComfyUI - for multiple reasons, which are explained [here](https://github.com/huggingface/diffusers/pull/4287/#issuecomment-1655110736).
|
||||
- [LyCORIS checkpoints](https://github.com/KohakuBlueleaf/LyCORIS) aren't fully supported. The [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method loads LyCORIS checkpoints with LoRA and LoCon modules, but Hada and LoKR are not supported.
|
||||
|
||||
</Tip>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="TheLastBen">
|
||||
|
||||
Loading a checkpoint from TheLastBen is very similar. For example, to load the [TheLastBen/William_Eggleston_Style_SDXL](https://huggingface.co/TheLastBen/William_Eggleston_Style_SDXL) checkpoint:
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_lora_weights("TheLastBen/William_Eggleston_Style_SDXL", weight_name="wegg.safetensors")
|
||||
|
||||
# use by william eggleston in the prompt to trigger the LoRA
|
||||
prompt = "a house by william eggleston, sunrays, beautiful, sunlight, sunrays, beautiful"
|
||||
image = pipeline(prompt=prompt).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## IP-Adapter
|
||||
|
||||
[IP-Adapter](https://ip-adapter.github.io/) is a lightweight adapter that enables image prompting for any diffusion model. This adapter works by decoupling the cross-attention layers of the image and text features. All the other model components are frozen and only the embedded image features in the UNet are trained. As a result, IP-Adapter files are typically only ~100MBs.
|
||||
|
||||
You can learn more about how to use IP-Adapter for different tasks and specific use cases in the [IP-Adapter](../using-diffusers/ip_adapter) guide.
|
||||
|
||||
> [!TIP]
|
||||
> Diffusers currently only supports IP-Adapter for some of the most popular pipelines. Feel free to open a feature request if you have a cool use case and want to integrate IP-Adapter with an unsupported pipeline!
|
||||
> Official IP-Adapter checkpoints are available from [h94/IP-Adapter](https://huggingface.co/h94/IP-Adapter).
|
||||
|
||||
To start, load a Stable Diffusion checkpoint.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
from diffusers.utils import load_image
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16).to("cuda")
|
||||
```
|
||||
|
||||
Then load the IP-Adapter weights and add it to the pipeline with the [`~loaders.IPAdapterMixin.load_ip_adapter`] method.
|
||||
|
||||
```py
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
```
|
||||
|
||||
Once loaded, you can use the pipeline with an image and text prompt to guide the image generation process.
|
||||
|
||||
```py
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png")
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
images = pipeline(
|
||||
prompt='best quality, high quality, wearing sunglasses',
|
||||
ip_adapter_image=image,
|
||||
negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
images
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/ip-bear.png" />
|
||||
</div>
|
||||
|
||||
### IP-Adapter Plus
|
||||
|
||||
IP-Adapter relies on an image encoder to generate image features. If the IP-Adapter repository contains an `image_encoder` subfolder, the image encoder is automatically loaded and registered to the pipeline. Otherwise, you'll need to explicitly load the image encoder with a [`~transformers.CLIPVisionModelWithProjection`] model and pass it to the pipeline.
|
||||
|
||||
This is the case for *IP-Adapter Plus* checkpoints which use the ViT-H image encoder.
|
||||
|
||||
```py
|
||||
from transformers import CLIPVisionModelWithProjection
|
||||
|
||||
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
|
||||
"h94/IP-Adapter",
|
||||
subfolder="models/image_encoder",
|
||||
torch_dtype=torch.float16
|
||||
)
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
image_encoder=image_encoder,
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter-plus_sdxl_vit-h.safetensors")
|
||||
```
|
||||
|
||||
### IP-Adapter Face ID models
|
||||
|
||||
The IP-Adapter FaceID models are experimental IP Adapters that use image embeddings generated by `insightface` instead of CLIP image embeddings. Some of these models also use LoRA to improve ID consistency.
|
||||
You need to install `insightface` and all its requirements to use these models.
|
||||
|
||||
<Tip warning={true}>
|
||||
As InsightFace pretrained models are available for non-commercial research purposes, IP-Adapter-FaceID models are released exclusively for research purposes and are not intended for commercial use.
|
||||
</Tip>
|
||||
|
||||
```py
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid_sdxl.bin", image_encoder_folder=None)
|
||||
```
|
||||
|
||||
If you want to use one of the two IP-Adapter FaceID Plus models, you must also load the CLIP image encoder, as this models use both `insightface` and CLIP image embeddings to achieve better photorealism.
|
||||
|
||||
```py
|
||||
from transformers import CLIPVisionModelWithProjection
|
||||
|
||||
image_encoder = CLIPVisionModelWithProjection.from_pretrained(
|
||||
"laion/CLIP-ViT-H-14-laion2B-s32B-b79K",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
image_encoder=image_encoder,
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter-FaceID", subfolder=None, weight_name="ip-adapter-faceid-plus_sd15.bin")
|
||||
```
|
||||
@@ -1,266 +0,0 @@
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Merge LoRAs
|
||||
|
||||
It can be fun and creative to use multiple [LoRAs]((https://huggingface.co/docs/peft/conceptual_guides/adapter#low-rank-adaptation-lora)) together to generate something entirely new and unique. This works by merging multiple LoRA weights together to produce images that are a blend of different styles. Diffusers provides a few methods to merge LoRAs depending on *how* you want to merge their weights, which can affect image quality.
|
||||
|
||||
This guide will show you how to merge LoRAs using the [`~loaders.PeftAdapterMixin.set_adapters`] and [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) methods. To improve inference speed and reduce memory-usage of merged LoRAs, you'll also see how to use the [`~loaders.StableDiffusionLoraLoaderMixin.fuse_lora`] method to fuse the LoRA weights with the original weights of the underlying model.
|
||||
|
||||
For this guide, load a Stable Diffusion XL (SDXL) checkpoint and the [KappaNeuro/studio-ghibli-style](https://huggingface.co/KappaNeuro/studio-ghibli-style) and [Norod78/sdxl-chalkboarddrawing-lora](https://huggingface.co/Norod78/sdxl-chalkboarddrawing-lora) LoRAs with the [`~loaders.StableDiffusionLoraLoaderMixin.load_lora_weights`] method. You'll need to assign each LoRA an `adapter_name` to combine them later.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl", weight_name="ikea_instructions_xl_v1_5.safetensors", adapter_name="ikea")
|
||||
pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_XL.safetensors", adapter_name="feng")
|
||||
```
|
||||
|
||||
## set_adapters
|
||||
|
||||
The [`~loaders.PeftAdapterMixin.set_adapters`] method merges LoRA adapters by concatenating their weighted matrices. Use the adapter name to specify which LoRAs to merge, and the `adapter_weights` parameter to control the scaling for each LoRA. For example, if `adapter_weights=[0.5, 0.5]`, then the merged LoRA output is an average of both LoRAs. Try adjusting the adapter weights to see how it affects the generated image!
|
||||
|
||||
```py
|
||||
pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
|
||||
|
||||
generator = torch.manual_seed(0)
|
||||
prompt = "A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai"
|
||||
image = pipeline(prompt, generator=generator, cross_attention_kwargs={"scale": 1.0}).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/lora_merge_set_adapters.png"/>
|
||||
</div>
|
||||
|
||||
## add_weighted_adapter
|
||||
|
||||
> [!WARNING]
|
||||
> This is an experimental method that adds PEFTs [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method to Diffusers to enable more efficient merging methods. Check out this [issue](https://github.com/huggingface/diffusers/issues/6892) if you're interested in learning more about the motivation and design behind this integration.
|
||||
|
||||
The [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method provides access to more efficient merging method such as [TIES and DARE](https://huggingface.co/docs/peft/developer_guides/model_merging). To use these merging methods, make sure you have the latest stable version of Diffusers and PEFT installed.
|
||||
|
||||
```bash
|
||||
pip install -U diffusers peft
|
||||
```
|
||||
|
||||
There are three steps to merge LoRAs with the [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method:
|
||||
|
||||
1. Create a [PeftModel](https://huggingface.co/docs/peft/package_reference/peft_model#peft.PeftModel) from the underlying model and LoRA checkpoint.
|
||||
2. Load a base UNet model and the LoRA adapters.
|
||||
3. Merge the adapters using the [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method and the merging method of your choice.
|
||||
|
||||
Let's dive deeper into what these steps entail.
|
||||
|
||||
1. Load a UNet that corresponds to the UNet in the LoRA checkpoint. In this case, both LoRAs use the SDXL UNet as their base model.
|
||||
|
||||
```python
|
||||
from diffusers import AutoModel
|
||||
import torch
|
||||
|
||||
unet = AutoModel.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
variant="fp16",
|
||||
subfolder="unet",
|
||||
).to("cuda")
|
||||
```
|
||||
|
||||
Load the SDXL pipeline and the LoRA checkpoints, starting with the [ostris/ikea-instructions-lora-sdxl](https://huggingface.co/ostris/ikea-instructions-lora-sdxl) LoRA.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
variant="fp16",
|
||||
torch_dtype=torch.float16,
|
||||
unet=unet
|
||||
).to("cuda")
|
||||
pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl", weight_name="ikea_instructions_xl_v1_5.safetensors", adapter_name="ikea")
|
||||
```
|
||||
|
||||
Now you'll create a [PeftModel](https://huggingface.co/docs/peft/package_reference/peft_model#peft.PeftModel) from the loaded LoRA checkpoint by combining the SDXL UNet and the LoRA UNet from the pipeline.
|
||||
|
||||
```python
|
||||
from peft import get_peft_model, LoraConfig
|
||||
import copy
|
||||
|
||||
sdxl_unet = copy.deepcopy(unet)
|
||||
ikea_peft_model = get_peft_model(
|
||||
sdxl_unet,
|
||||
pipeline.unet.peft_config["ikea"],
|
||||
adapter_name="ikea"
|
||||
)
|
||||
|
||||
original_state_dict = {f"base_model.model.{k}": v for k, v in pipeline.unet.state_dict().items()}
|
||||
ikea_peft_model.load_state_dict(original_state_dict, strict=True)
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> You can optionally push the ikea_peft_model to the Hub by calling `ikea_peft_model.push_to_hub("ikea_peft_model", token=TOKEN)`.
|
||||
|
||||
Repeat this process to create a [PeftModel](https://huggingface.co/docs/peft/package_reference/peft_model#peft.PeftModel) from the [lordjia/by-feng-zikai](https://huggingface.co/lordjia/by-feng-zikai) LoRA.
|
||||
|
||||
```python
|
||||
pipeline.delete_adapters("ikea")
|
||||
sdxl_unet.delete_adapters("ikea")
|
||||
|
||||
pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_XL.safetensors", adapter_name="feng")
|
||||
pipeline.set_adapters(adapter_names="feng")
|
||||
|
||||
feng_peft_model = get_peft_model(
|
||||
sdxl_unet,
|
||||
pipeline.unet.peft_config["feng"],
|
||||
adapter_name="feng"
|
||||
)
|
||||
|
||||
original_state_dict = {f"base_model.model.{k}": v for k, v in pipe.unet.state_dict().items()}
|
||||
feng_peft_model.load_state_dict(original_state_dict, strict=True)
|
||||
```
|
||||
|
||||
2. Load a base UNet model and then load the adapters onto it.
|
||||
|
||||
```python
|
||||
from peft import PeftModel
|
||||
|
||||
base_unet = AutoModel.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
use_safetensors=True,
|
||||
variant="fp16",
|
||||
subfolder="unet",
|
||||
).to("cuda")
|
||||
|
||||
model = PeftModel.from_pretrained(base_unet, "stevhliu/ikea_peft_model", use_safetensors=True, subfolder="ikea", adapter_name="ikea")
|
||||
model.load_adapter("stevhliu/feng_peft_model", use_safetensors=True, subfolder="feng", adapter_name="feng")
|
||||
```
|
||||
|
||||
3. Merge the adapters using the [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) method and the merging method of your choice (learn more about other merging methods in this [blog post](https://huggingface.co/blog/peft_merging)). For this example, let's use the `"dare_linear"` method to merge the LoRAs.
|
||||
|
||||
> [!WARNING]
|
||||
> Keep in mind the LoRAs need to have the same rank to be merged!
|
||||
|
||||
```python
|
||||
model.add_weighted_adapter(
|
||||
adapters=["ikea", "feng"],
|
||||
weights=[1.0, 1.0],
|
||||
combination_type="dare_linear",
|
||||
adapter_name="ikea-feng"
|
||||
)
|
||||
model.set_adapters("ikea-feng")
|
||||
```
|
||||
|
||||
Now you can generate an image with the merged LoRA.
|
||||
|
||||
```python
|
||||
model = model.to(dtype=torch.float16, device="cuda")
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", unet=model, variant="fp16", torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
|
||||
image = pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai", generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ikea-feng-dare-linear.png"/>
|
||||
</div>
|
||||
|
||||
## fuse_lora
|
||||
|
||||
Both the [`~loaders.PeftAdapterMixin.set_adapters`] and [add_weighted_adapter](https://huggingface.co/docs/peft/package_reference/lora#peft.LoraModel.add_weighted_adapter) methods require loading the base model and the LoRA adapters separately which incurs some overhead. The [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method allows you to fuse the LoRA weights directly with the original weights of the underlying model. This way, you're only loading the model once which can increase inference and lower memory-usage.
|
||||
|
||||
You can use PEFT to easily fuse/unfuse multiple adapters directly into the model weights (both UNet and text encoder) using the [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method, which can lead to a speed-up in inference and lower VRAM usage.
|
||||
|
||||
For example, if you have a base model and adapters loaded and set as active with the following adapter weights:
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl", weight_name="ikea_instructions_xl_v1_5.safetensors", adapter_name="ikea")
|
||||
pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_XL.safetensors", adapter_name="feng")
|
||||
|
||||
pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
|
||||
```
|
||||
|
||||
Fuse these LoRAs into the UNet with the [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method. The `lora_scale` parameter controls how much to scale the output by with the LoRA weights. It is important to make the `lora_scale` adjustments in the [`~loaders.lora_base.LoraBaseMixin.fuse_lora`] method because it won’t work if you try to pass `scale` to the `cross_attention_kwargs` in the pipeline.
|
||||
|
||||
```py
|
||||
pipeline.fuse_lora(adapter_names=["ikea", "feng"], lora_scale=1.0)
|
||||
```
|
||||
|
||||
Then you should use [`~loaders.StableDiffusionLoraLoaderMixin.unload_lora_weights`] to unload the LoRA weights since they've already been fused with the underlying base model. Finally, call [`~DiffusionPipeline.save_pretrained`] to save the fused pipeline locally or you could call [`~DiffusionPipeline.push_to_hub`] to push the fused pipeline to the Hub.
|
||||
|
||||
```py
|
||||
pipeline.unload_lora_weights()
|
||||
# save locally
|
||||
pipeline.save_pretrained("path/to/fused-pipeline")
|
||||
# save to the Hub
|
||||
pipeline.push_to_hub("fused-ikea-feng")
|
||||
```
|
||||
|
||||
Now you can quickly load the fused pipeline and use it for inference without needing to separately load the LoRA adapters.
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"username/fused-ikea-feng", torch_dtype=torch.float16,
|
||||
).to("cuda")
|
||||
|
||||
image = pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai", generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
You can call [`~~loaders.lora_base.LoraBaseMixin.unfuse_lora`] to restore the original model's weights (for example, if you want to use a different `lora_scale` value). However, this only works if you've only fused one LoRA adapter to the original model. If you've fused multiple LoRAs, you'll need to reload the model.
|
||||
|
||||
```py
|
||||
pipeline.unfuse_lora()
|
||||
```
|
||||
|
||||
### torch.compile
|
||||
|
||||
[torch.compile](../optimization/torch2.0#torchcompile) can speed up your pipeline even more, but the LoRA weights must be fused first and then unloaded. Typically, the UNet is compiled because it is such a computationally intensive component of the pipeline.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
# load base model and LoRAs
|
||||
pipeline = DiffusionPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_lora_weights("ostris/ikea-instructions-lora-sdxl", weight_name="ikea_instructions_xl_v1_5.safetensors", adapter_name="ikea")
|
||||
pipeline.load_lora_weights("lordjia/by-feng-zikai", weight_name="fengzikai_v1.0_XL.safetensors", adapter_name="feng")
|
||||
|
||||
# activate both LoRAs and set adapter weights
|
||||
pipeline.set_adapters(["ikea", "feng"], adapter_weights=[0.7, 0.8])
|
||||
|
||||
# fuse LoRAs and unload weights
|
||||
pipeline.fuse_lora(adapter_names=["ikea", "feng"], lora_scale=1.0)
|
||||
pipeline.unload_lora_weights()
|
||||
|
||||
# torch.compile
|
||||
pipeline.unet.to(memory_format=torch.channels_last)
|
||||
pipeline.unet = torch.compile(pipeline.unet, mode="reduce-overhead", fullgraph=True)
|
||||
|
||||
image = pipeline("A bowl of ramen shaped like a cute kawaii bear, by Feng Zikai", generator=torch.manual_seed(0)).images[0]
|
||||
```
|
||||
|
||||
Learn more about torch.compile in the [Accelerate inference of text-to-image diffusion models](../tutorials/fast_diffusion#torchcompile) guide.
|
||||
|
||||
## Next steps
|
||||
|
||||
For more conceptual details about how each merging method works, take a look at the [🤗 PEFT welcomes new merging methods](https://huggingface.co/blog/peft_merging#concatenation-cat) blog post!
|
||||
@@ -154,11 +154,11 @@ pipeline = AutoPipelineForInpainting.from_pretrained(
|
||||
pipeline.enable_model_cpu_offload()
|
||||
```
|
||||
|
||||
You can enable PAG on an exisiting inpainting pipeline like this
|
||||
You can enable PAG on an existing inpainting pipeline like this
|
||||
|
||||
```py
|
||||
pipeline_inpaint = AutoPipelineForInpaiting.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
|
||||
pipeline = AutoPipelineForInpaiting.from_pipe(pipeline_inpaint, enable_pag=True)
|
||||
pipeline_inpaint = AutoPipelineForInpainting.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16)
|
||||
pipeline = AutoPipelineForInpainting.from_pipe(pipeline_inpaint, enable_pag=True)
|
||||
```
|
||||
|
||||
This still works when your pipeline has a different task:
|
||||
|
||||
@@ -12,41 +12,21 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# T2I-Adapter
|
||||
|
||||
[T2I-Adapter](https://hf.co/papers/2302.08453) is a lightweight adapter for controlling and providing more accurate
|
||||
structure guidance for text-to-image models. It works by learning an alignment between the internal knowledge of the
|
||||
text-to-image model and an external control signal, such as edge detection or depth estimation.
|
||||
[T2I-Adapter](https://huggingface.co/papers/2302.08453) is an adapter that enables controllable generation like [ControlNet](./controlnet). A T2I-Adapter works by learning a *mapping* between a control signal (for example, a depth map) and a pretrained model's internal knowledge. The adapter is plugged in to the base model to provide extra guidance based on the control signal during generation.
|
||||
|
||||
The T2I-Adapter design is simple, the condition is passed to four feature extraction blocks and three downsample
|
||||
blocks. This makes it fast and easy to train different adapters for different conditions which can be plugged into the
|
||||
text-to-image model. T2I-Adapter is similar to [ControlNet](controlnet) except it is smaller (~77M parameters) and
|
||||
faster because it only runs once during the diffusion process. The downside is that performance may be slightly worse
|
||||
than ControlNet.
|
||||
|
||||
This guide will show you how to use T2I-Adapter with different Stable Diffusion models and how you can compose multiple
|
||||
T2I-Adapters to impose more than one condition.
|
||||
|
||||
> [!TIP]
|
||||
> There are several T2I-Adapters available for different conditions, such as color palette, depth, sketch, pose, and
|
||||
> segmentation. Check out the [TencentARC](https://hf.co/TencentARC) repository to try them out!
|
||||
|
||||
Before you begin, make sure you have the following libraries installed.
|
||||
Load a T2I-Adapter conditioned on a specific control, such as canny edge, and pass it to the pipeline in [`~DiffusionPipeline.from_pretrained`].
|
||||
|
||||
```py
|
||||
# uncomment to install the necessary libraries in Colab
|
||||
#!pip install -q diffusers accelerate controlnet-aux==0.0.7
|
||||
import torch
|
||||
from diffusers import T2IAdapter, StableDiffusionXLAdapterPipeline, AutoencoderKL
|
||||
|
||||
t2i_adapter = T2IAdapter.from_pretrained(
|
||||
"TencentARC/t2i-adapter-canny-sdxl-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
```
|
||||
|
||||
## Text-to-image
|
||||
|
||||
Text-to-image models rely on a prompt to generate an image, but sometimes, text alone may not be enough to provide more
|
||||
accurate structural guidance. T2I-Adapter allows you to provide an additional control image to guide the generation
|
||||
process. For example, you can provide a canny image (a white outline of an image on a black background) to guide the
|
||||
model to generate an image with a similar structure.
|
||||
|
||||
<hfoptions id="stablediffusion">
|
||||
<hfoption id="Stable Diffusion 1.5">
|
||||
|
||||
Create a canny image with the [opencv-library](https://github.com/opencv/opencv-python).
|
||||
Generate a canny image with [opencv-python](https://github.com/opencv/opencv-python).
|
||||
|
||||
```py
|
||||
import cv2
|
||||
@@ -54,166 +34,124 @@ import numpy as np
|
||||
from PIL import Image
|
||||
from diffusers.utils import load_image
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png")
|
||||
image = np.array(image)
|
||||
original_image = load_image(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png"
|
||||
)
|
||||
|
||||
image = np.array(original_image)
|
||||
|
||||
low_threshold = 100
|
||||
high_threshold = 200
|
||||
|
||||
image = cv2.Canny(image, low_threshold, high_threshold)
|
||||
image = Image.fromarray(image)
|
||||
image = image[:, :, None]
|
||||
image = np.concatenate([image, image, image], axis=2)
|
||||
canny_image = Image.fromarray(image)
|
||||
```
|
||||
|
||||
Now load a T2I-Adapter conditioned on [canny images](https://hf.co/TencentARC/t2iadapter_canny_sd15v2) and pass it to
|
||||
the [`StableDiffusionAdapterPipeline`].
|
||||
Pass the canny image to the pipeline to generate an image.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionAdapterPipeline, T2IAdapter
|
||||
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2iadapter_canny_sd15v2", torch_dtype=torch.float16)
|
||||
pipeline = StableDiffusionAdapterPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
adapter=adapter,
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
```
|
||||
|
||||
Finally, pass your prompt and control image to the pipeline.
|
||||
|
||||
```py
|
||||
generator = torch.Generator("cuda").manual_seed(0)
|
||||
|
||||
image = pipeline(
|
||||
prompt="cinematic photo of a plush and soft midcentury style rug on a wooden floor, 35mm photograph, film, professional, 4k, highly detailed",
|
||||
image=image,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-sd1.5.png"/>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Stable Diffusion XL">
|
||||
|
||||
Create a canny image with the [controlnet-aux](https://github.com/huggingface/controlnet_aux) library.
|
||||
|
||||
```py
|
||||
from controlnet_aux.canny import CannyDetector
|
||||
from diffusers.utils import load_image
|
||||
|
||||
canny_detector = CannyDetector()
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/hf-logo.png")
|
||||
image = canny_detector(image, detect_resolution=384, image_resolution=1024)
|
||||
```
|
||||
|
||||
Now load a T2I-Adapter conditioned on [canny images](https://hf.co/TencentARC/t2i-adapter-canny-sdxl-1.0) and pass it
|
||||
to the [`StableDiffusionXLAdapterPipeline`].
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLAdapterPipeline, T2IAdapter, EulerAncestralDiscreteScheduler, AutoencoderKL
|
||||
|
||||
scheduler = EulerAncestralDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler")
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
|
||||
adapter = T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16)
|
||||
pipeline = StableDiffusionXLAdapterPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
adapter=adapter,
|
||||
adapter=t2i_adapter,
|
||||
vae=vae,
|
||||
scheduler=scheduler,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
```
|
||||
).to("cuda")
|
||||
|
||||
Finally, pass your prompt and control image to the pipeline.
|
||||
prompt = """
|
||||
A photorealistic overhead image of a cat reclining sideways in a flamingo pool floatie holding a margarita.
|
||||
The cat is floating leisurely in the pool and completely relaxed and happy.
|
||||
"""
|
||||
|
||||
```py
|
||||
generator = torch.Generator("cuda").manual_seed(0)
|
||||
|
||||
image = pipeline(
|
||||
prompt="cinematic photo of a plush and soft midcentury style rug on a wooden floor, 35mm photograph, film, professional, 4k, highly detailed",
|
||||
image=image,
|
||||
generator=generator,
|
||||
pipeline(
|
||||
prompt,
|
||||
image=canny_image,
|
||||
num_inference_steps=100,
|
||||
guidance_scale=10,
|
||||
).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-sdxl.png"/>
|
||||
<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/non-enhanced-prompt.png" width="300" alt="Generated image (prompt only)"/>
|
||||
<figcaption style="text-align: center;">original image</figcaption>
|
||||
</figure>
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png" width="300" alt="Control image (Canny edges)"/>
|
||||
<figcaption style="text-align: center;">canny image</figcaption>
|
||||
</figure>
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-canny-cat-generated.png" width="300" alt="Generated image (ControlNet + prompt)"/>
|
||||
<figcaption style="text-align: center;">generated image</figcaption>
|
||||
</figure>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## MultiAdapter
|
||||
|
||||
T2I-Adapters are also composable, allowing you to use more than one adapter to impose multiple control conditions on an
|
||||
image. For example, you can use a pose map to provide structural control and a depth map for depth control. This is
|
||||
enabled by the [`MultiAdapter`] class.
|
||||
You can compose multiple controls, such as canny image and a depth map, with the [`MultiAdapter`] class.
|
||||
|
||||
Let's condition a text-to-image model with a pose and depth adapter. Create and place your depth and pose image and in a list.
|
||||
The example below composes a canny image and depth map.
|
||||
|
||||
```py
|
||||
from diffusers.utils import load_image
|
||||
|
||||
pose_image = load_image(
|
||||
"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"
|
||||
)
|
||||
depth_image = load_image(
|
||||
"https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"
|
||||
)
|
||||
cond = [pose_image, depth_image]
|
||||
prompt = ["Santa Claus walking into an office room with a beautiful city view"]
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/depth_sample_input.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">depth image</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/diffusers/docs-images/resolve/main/t2i-adapter/keypose_sample_input.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">pose image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
Load the corresponding pose and depth adapters as a list in the [`MultiAdapter`] class.
|
||||
Load the control images and T2I-Adapters as a list.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionAdapterPipeline, MultiAdapter, T2IAdapter
|
||||
from diffusers.utils import load_image
|
||||
from diffusers import StableDiffusionXLAdapterPipeline, AutoencoderKL, MultiAdapter, T2IAdapter
|
||||
|
||||
canny_image = load_image(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png"
|
||||
)
|
||||
depth_image = load_image(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_depth_image.png"
|
||||
)
|
||||
controls = [canny_image, depth_image]
|
||||
prompt = ["""
|
||||
a relaxed rabbit sitting on a striped towel next to a pool with a tropical drink nearby,
|
||||
bright sunny day, vacation scene, 35mm photograph, film, professional, 4k, highly detailed
|
||||
"""]
|
||||
|
||||
adapters = MultiAdapter(
|
||||
[
|
||||
T2IAdapter.from_pretrained("TencentARC/t2iadapter_keypose_sd14v1"),
|
||||
T2IAdapter.from_pretrained("TencentARC/t2iadapter_depth_sd14v1"),
|
||||
T2IAdapter.from_pretrained("TencentARC/t2i-adapter-canny-sdxl-1.0", torch_dtype=torch.float16),
|
||||
T2IAdapter.from_pretrained("TencentARC/t2i-adapter-depth-midas-sdxl-1.0", torch_dtype=torch.float16),
|
||||
]
|
||||
)
|
||||
adapters = adapters.to(torch.float16)
|
||||
```
|
||||
|
||||
Finally, load a [`StableDiffusionAdapterPipeline`] with the adapters, and pass your prompt and conditioned images to
|
||||
it. Use the [`adapter_conditioning_scale`] to adjust the weight of each adapter on the image.
|
||||
Pass the adapters, prompt, and control images to [`StableDiffusionXLAdapterPipeline`]. Use the `adapter_conditioning_scale` parameter to determine how much weight to assign to each control.
|
||||
|
||||
```py
|
||||
pipeline = StableDiffusionAdapterPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16)
|
||||
pipeline = StableDiffusionXLAdapterPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
vae=vae,
|
||||
adapter=adapters,
|
||||
).to("cuda")
|
||||
|
||||
image = pipeline(prompt, cond, adapter_conditioning_scale=[0.7, 0.7]).images[0]
|
||||
image
|
||||
pipeline(
|
||||
prompt,
|
||||
image=controls,
|
||||
height=1024,
|
||||
width=1024,
|
||||
adapter_conditioning_scale=[0.7, 0.7]
|
||||
).images[0]
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-multi.png"/>
|
||||
<div style="display: flex; gap: 10px; justify-content: space-around; align-items: flex-end;">
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/canny-cat.png" width="300" alt="Generated image (prompt only)"/>
|
||||
<figcaption style="text-align: center;">canny image</figcaption>
|
||||
</figure>
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/sdxl_depth_image.png" width="300" alt="Control image (Canny edges)"/>
|
||||
<figcaption style="text-align: center;">depth map</figcaption>
|
||||
</figure>
|
||||
<figure>
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/t2i-multi-rabbit.png" width="300" alt="Generated image (ControlNet + prompt)"/>
|
||||
<figcaption style="text-align: center;">generated image</figcaption>
|
||||
</figure>
|
||||
</div>
|
||||
@@ -10,109 +10,56 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Textual inversion
|
||||
# Textual Inversion
|
||||
|
||||
[[open-in-colab]]
|
||||
[Textual Inversion](https://huggingface.co/papers/2208.01618) is a method for generating personalized images of a concept. It works by fine-tuning a models word embeddings on 3-5 images of the concept (for example, pixel art) that is associated with a unique token (`<sks>`). This allows you to use the `<sks>` token in your prompt to trigger the model to generate pixel art images.
|
||||
|
||||
The [`StableDiffusionPipeline`] supports textual inversion, a technique that enables a model like Stable Diffusion to learn a new concept from just a few sample images. This gives you more control over the generated images and allows you to tailor the model towards specific concepts. You can get started quickly with a collection of community created concepts in the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer).
|
||||
|
||||
This guide will show you how to run inference with textual inversion using a pre-learned concept from the Stable Diffusion Conceptualizer. If you're interested in teaching a model new concepts with textual inversion, take a look at the [Textual Inversion](../training/text_inversion) training guide.
|
||||
|
||||
Import the necessary libraries:
|
||||
Textual Inversion weights are very lightweight and typically only a few KBs because they're only word embeddings. However, this also means the word embeddings need to be loaded after loading a model with [`~DiffusionPipeline.from_pretrained`].
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
from diffusers.utils import make_image_grid
|
||||
```
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
|
||||
## Stable Diffusion 1 and 2
|
||||
|
||||
Pick a Stable Diffusion checkpoint and a pre-learned concept from the [Stable Diffusion Conceptualizer](https://huggingface.co/spaces/sd-concepts-library/stable-diffusion-conceptualizer):
|
||||
|
||||
```py
|
||||
pretrained_model_name_or_path = "stable-diffusion-v1-5/stable-diffusion-v1-5"
|
||||
repo_id_embeds = "sd-concepts-library/cat-toy"
|
||||
```
|
||||
|
||||
Now you can load a pipeline, and pass the pre-learned concept to it:
|
||||
|
||||
```py
|
||||
pipeline = StableDiffusionPipeline.from_pretrained(
|
||||
pretrained_model_name_or_path, torch_dtype=torch.float16, use_safetensors=True
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
pipeline.load_textual_inversion(repo_id_embeds)
|
||||
```
|
||||
|
||||
Create a prompt with the pre-learned concept by using the special placeholder token `<cat-toy>`, and choose the number of samples and rows of images you'd like to generate:
|
||||
Load the word embeddings with [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] and include the unique token in the prompt to activate its generation.
|
||||
|
||||
```py
|
||||
prompt = "a grafitti in a favela wall with a <cat-toy> on it"
|
||||
|
||||
num_samples_per_row = 2
|
||||
num_rows = 2
|
||||
```
|
||||
|
||||
Then run the pipeline (feel free to adjust the parameters like `num_inference_steps` and `guidance_scale` to see how they affect image quality), save the generated images and visualize them with the helper function you created at the beginning:
|
||||
|
||||
```py
|
||||
all_images = []
|
||||
for _ in range(num_rows):
|
||||
images = pipeline(prompt, num_images_per_prompt=num_samples_per_row, num_inference_steps=50, guidance_scale=7.5).images
|
||||
all_images.extend(images)
|
||||
|
||||
grid = make_image_grid(all_images, num_rows, num_samples_per_row)
|
||||
grid
|
||||
pipeline.load_textual_inversion("sd-concepts-library/gta5-artwork")
|
||||
prompt = "A cute brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration, <gta5-artwork> style"
|
||||
pipeline(prompt).images[0]
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/textual_inversion_inference.png">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_txt_embed.png" />
|
||||
</div>
|
||||
|
||||
## Stable Diffusion XL
|
||||
Textual Inversion can also be trained to learn *negative embeddings* to steer generation away from unwanted characteristics such as "blurry" or "ugly". It is useful for improving image quality.
|
||||
|
||||
Stable Diffusion XL (SDXL) can also use textual inversion vectors for inference. In contrast to Stable Diffusion 1 and 2, SDXL has two text encoders so you'll need two textual inversion embeddings - one for each text encoder model.
|
||||
|
||||
Let's download the SDXL textual inversion embeddings and have a closer look at it's structure:
|
||||
EasyNegative is a widely used negative embedding that contains multiple learned negative concepts. Load the negative embeddings and specify the file name and token associated with the negative embeddings. Pass the token to `negative_prompt` in your pipeline to activate it.
|
||||
|
||||
```py
|
||||
from huggingface_hub import hf_hub_download
|
||||
from safetensors.torch import load_file
|
||||
|
||||
file = hf_hub_download("dn118/unaestheticXL", filename="unaestheticXLv31.safetensors")
|
||||
state_dict = load_file(file)
|
||||
state_dict
|
||||
```
|
||||
|
||||
```
|
||||
{'clip_g': tensor([[ 0.0077, -0.0112, 0.0065, ..., 0.0195, 0.0159, 0.0275],
|
||||
...,
|
||||
[-0.0170, 0.0213, 0.0143, ..., -0.0302, -0.0240, -0.0362]],
|
||||
'clip_l': tensor([[ 0.0023, 0.0192, 0.0213, ..., -0.0385, 0.0048, -0.0011],
|
||||
...,
|
||||
[ 0.0475, -0.0508, -0.0145, ..., 0.0070, -0.0089, -0.0163]],
|
||||
```
|
||||
|
||||
There are two tensors, `"clip_g"` and `"clip_l"`.
|
||||
`"clip_g"` corresponds to the bigger text encoder in SDXL and refers to
|
||||
`pipe.text_encoder_2` and `"clip_l"` refers to `pipe.text_encoder`.
|
||||
|
||||
Now you can load each tensor separately by passing them along with the correct text encoder and tokenizer
|
||||
to [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`]:
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
|
||||
pipe = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", variant="fp16", torch_dtype=torch.float16)
|
||||
pipe.to("cuda")
|
||||
|
||||
pipe.load_textual_inversion(state_dict["clip_g"], token="unaestheticXLv31", text_encoder=pipe.text_encoder_2, tokenizer=pipe.tokenizer_2)
|
||||
pipe.load_textual_inversion(state_dict["clip_l"], token="unaestheticXLv31", text_encoder=pipe.text_encoder, tokenizer=pipe.tokenizer)
|
||||
|
||||
# the embedding should be used as a negative embedding, so we pass it as a negative prompt
|
||||
generator = torch.Generator().manual_seed(33)
|
||||
image = pipe("a woman standing in front of a mountain", negative_prompt="unaestheticXLv31", generator=generator).images[0]
|
||||
image
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
pipeline.load_textual_inversion(
|
||||
"EvilEngine/easynegative",
|
||||
weight_name="easynegative.safetensors",
|
||||
token="easynegative"
|
||||
)
|
||||
prompt = "A cute brown bear eating a slice of pizza, stunning color scheme, masterpiece, illustration"
|
||||
negative_prompt = "easynegative"
|
||||
pipeline(prompt, negative_prompt).images[0]
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png" />
|
||||
</div>
|
||||
@@ -125,7 +125,7 @@ Now we'll simply specify the name of the dataset and caption column (in this cas
|
||||
```
|
||||
|
||||
You can also load a dataset straight from by specifying it's name in `dataset_name`.
|
||||
Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loadin your own caption dataset.
|
||||
Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loading your own caption dataset.
|
||||
|
||||
- **optimizer**: for this example, we'll use [prodigy](https://huggingface.co/blog/sdxl_lora_advanced_script#adaptive-optimizers) - an adaptive optimizer
|
||||
- **pivotal tuning**
|
||||
@@ -404,7 +404,7 @@ The advanced script now supports custom choice of U-net blocks to train during D
|
||||
> In light of this, we're introducing a new feature to the advanced script to allow for configurable U-net learned blocks.
|
||||
|
||||
**Usage**
|
||||
Configure LoRA learned U-net blocks adding a `lora_unet_blocks` flag, with a comma seperated string specifying the targeted blocks.
|
||||
Configure LoRA learned U-net blocks adding a `lora_unet_blocks` flag, with a comma separated string specifying the targeted blocks.
|
||||
e.g:
|
||||
```bash
|
||||
--lora_unet_blocks="unet.up_blocks.0.attentions.0,unet.up_blocks.0.attentions.1"
|
||||
|
||||
@@ -141,7 +141,7 @@ Now we'll simply specify the name of the dataset and caption column (in this cas
|
||||
```
|
||||
|
||||
You can also load a dataset straight from by specifying it's name in `dataset_name`.
|
||||
Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loadin your own caption dataset.
|
||||
Look [here](https://huggingface.co/blog/sdxl_lora_advanced_script#custom-captioning) for more info on creating/loading your own caption dataset.
|
||||
|
||||
- **optimizer**: for this example, we'll use [prodigy](https://huggingface.co/blog/sdxl_lora_advanced_script#adaptive-optimizers) - an adaptive optimizer
|
||||
- **pivotal tuning**
|
||||
|
||||
@@ -770,6 +770,15 @@ def parse_args(input_args=None):
|
||||
),
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
parser.add_argument(
|
||||
"--image_interpolation_mode",
|
||||
type=str,
|
||||
default="lanczos",
|
||||
choices=[
|
||||
f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
|
||||
],
|
||||
help="The image interpolation method to use for resizing images.",
|
||||
)
|
||||
|
||||
if input_args is not None:
|
||||
args = parser.parse_args(input_args)
|
||||
@@ -1034,7 +1043,10 @@ class DreamBoothDataset(Dataset):
|
||||
self.instance_images.extend(itertools.repeat(img, repeats))
|
||||
|
||||
self.pixel_values = []
|
||||
train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
|
||||
interpolation = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper(), None)
|
||||
if interpolation is None:
|
||||
raise ValueError(f"Unsupported interpolation mode {interpolation=}.")
|
||||
train_resize = transforms.Resize(size, interpolation=interpolation)
|
||||
train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
|
||||
train_flip = transforms.RandomHorizontalFlip(p=1.0)
|
||||
train_transforms = transforms.Compose(
|
||||
@@ -1078,7 +1090,7 @@ class DreamBoothDataset(Dataset):
|
||||
|
||||
self.image_transforms = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.Resize(size, interpolation=interpolation),
|
||||
transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
|
||||
@@ -673,6 +673,15 @@ def parse_args(input_args=None):
|
||||
default=False,
|
||||
help="Cache the VAE latents",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_interpolation_mode",
|
||||
type=str,
|
||||
default="lanczos",
|
||||
choices=[
|
||||
f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
|
||||
],
|
||||
help="The image interpolation method to use for resizing images.",
|
||||
)
|
||||
|
||||
if input_args is not None:
|
||||
args = parser.parse_args(input_args)
|
||||
@@ -907,6 +916,10 @@ class DreamBoothDataset(Dataset):
|
||||
self.num_instance_images = len(self.instance_images)
|
||||
self._length = self.num_instance_images
|
||||
|
||||
interpolation = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper(), None)
|
||||
if interpolation is None:
|
||||
raise ValueError(f"Unsupported interpolation mode {interpolation=}.")
|
||||
|
||||
if class_data_root is not None:
|
||||
self.class_data_root = Path(class_data_root)
|
||||
self.class_data_root.mkdir(parents=True, exist_ok=True)
|
||||
@@ -921,7 +934,7 @@ class DreamBoothDataset(Dataset):
|
||||
|
||||
self.image_transforms = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.Resize(size, interpolation=interpolation),
|
||||
transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
|
||||
@@ -799,6 +799,15 @@ def parse_args(input_args=None):
|
||||
default=False,
|
||||
help="Cache the VAE latents",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_interpolation_mode",
|
||||
type=str,
|
||||
default="lanczos",
|
||||
choices=[
|
||||
f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
|
||||
],
|
||||
help="The image interpolation method to use for resizing images.",
|
||||
)
|
||||
|
||||
if input_args is not None:
|
||||
args = parser.parse_args(input_args)
|
||||
@@ -1069,7 +1078,10 @@ class DreamBoothDataset(Dataset):
|
||||
self.original_sizes = []
|
||||
self.crop_top_lefts = []
|
||||
self.pixel_values = []
|
||||
train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
|
||||
interpolation = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper(), None)
|
||||
if interpolation is None:
|
||||
raise ValueError(f"Unsupported interpolation mode {interpolation=}.")
|
||||
train_resize = transforms.Resize(size, interpolation=interpolation)
|
||||
train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
|
||||
train_flip = transforms.RandomHorizontalFlip(p=1.0)
|
||||
train_transforms = transforms.Compose(
|
||||
@@ -1146,7 +1158,7 @@ class DreamBoothDataset(Dataset):
|
||||
|
||||
self.image_transforms = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.Resize(size, interpolation=interpolation),
|
||||
transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
## Amused training
|
||||
|
||||
Amused can be finetuned on simple datasets relatively cheaply and quickly. Using 8bit optimizers, lora, and gradient accumulation, amused can be finetuned with as little as 5.5 GB. Here are a set of examples for finetuning amused on some relatively simple datasets. These training recipies are aggressively oriented towards minimal resources and fast verification -- i.e. the batch sizes are quite low and the learning rates are quite high. For optimal quality, you will probably want to increase the batch sizes and decrease learning rates.
|
||||
Amused can be finetuned on simple datasets relatively cheaply and quickly. Using 8bit optimizers, lora, and gradient accumulation, amused can be finetuned with as little as 5.5 GB. Here are a set of examples for finetuning amused on some relatively simple datasets. These training recipes are aggressively oriented towards minimal resources and fast verification -- i.e. the batch sizes are quite low and the learning rates are quite high. For optimal quality, you will probably want to increase the batch sizes and decrease learning rates.
|
||||
|
||||
All training examples use fp16 mixed precision and gradient checkpointing. We don't show 8 bit adam + lora as its about the same memory use as just using lora (bitsandbytes uses full precision optimizer states for weights below a minimum size).
|
||||
|
||||
|
||||
@@ -201,7 +201,7 @@ Note that setting the `<ID_TOKEN>` is not necessary. From some limited experimen
|
||||
> - The original repository uses a `lora_alpha` of `1`. We found this not suitable in many runs, possibly due to difference in modeling backends and training settings. Our recommendation is to set to the `lora_alpha` to either `rank` or `rank // 2`.
|
||||
> - If you're training on data whose captions generate bad results with the original model, a `rank` of 64 and above is good and also the recommendation by the team behind CogVideoX. If the generations are already moderately good on your training captions, a `rank` of 16/32 should work. We found that setting the rank too low, say `4`, is not ideal and doesn't produce promising results.
|
||||
> - The authors of CogVideoX recommend 4000 training steps and 100 training videos overall to achieve the best result. While that might yield the best results, we found from our limited experimentation that 2000 steps and 25 videos could also be sufficient.
|
||||
> - When using the Prodigy opitimizer for training, one can follow the recommendations from [this](https://huggingface.co/blog/sdxl_lora_advanced_script) blog. Prodigy tends to overfit quickly. From my very limited testing, I found a learning rate of `0.5` to be suitable in addition to `--prodigy_use_bias_correction`, `prodigy_safeguard_warmup` and `--prodigy_decouple`.
|
||||
> - When using the Prodigy optimizer for training, one can follow the recommendations from [this](https://huggingface.co/blog/sdxl_lora_advanced_script) blog. Prodigy tends to overfit quickly. From my very limited testing, I found a learning rate of `0.5` to be suitable in addition to `--prodigy_use_bias_correction`, `prodigy_safeguard_warmup` and `--prodigy_decouple`.
|
||||
> - The recommended learning rate by the CogVideoX authors and from our experimentation with Adam/AdamW is between `1e-3` and `1e-4` for a dataset of 25+ videos.
|
||||
>
|
||||
> Note that our testing is not exhaustive due to limited time for exploration. Our recommendation would be to play around with the different knobs and dials to find the best settings for your data.
|
||||
|
||||
@@ -879,7 +879,7 @@ def prepare_rotary_positional_embeddings(
|
||||
|
||||
|
||||
def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
|
||||
# Use DeepSpeed optimzer
|
||||
# Use DeepSpeed optimizer
|
||||
if use_deepspeed:
|
||||
from accelerate.utils import DummyOptim
|
||||
|
||||
|
||||
@@ -901,7 +901,7 @@ def prepare_rotary_positional_embeddings(
|
||||
|
||||
|
||||
def get_optimizer(args, params_to_optimize, use_deepspeed: bool = False):
|
||||
# Use DeepSpeed optimzer
|
||||
# Use DeepSpeed optimizer
|
||||
if use_deepspeed:
|
||||
from accelerate.utils import DummyOptim
|
||||
|
||||
|
||||
@@ -86,6 +86,7 @@ PIXART-α Controlnet pipeline | Implementation of the controlnet model for pixar
|
||||
| Perturbed-Attention Guidance |StableDiffusionPAGPipeline is a modification of StableDiffusionPipeline to support Perturbed-Attention Guidance (PAG).|[Perturbed-Attention Guidance](#perturbed-attention-guidance)|[Notebook](https://github.com/huggingface/notebooks/blob/main/diffusers/perturbed_attention_guidance.ipynb)|[Hyoungwon Cho](https://github.com/HyoungwonCho)|
|
||||
| CogVideoX DDIM Inversion Pipeline | Implementation of DDIM inversion and guided attention-based editing denoising process on CogVideoX. | [CogVideoX DDIM Inversion Pipeline](#cogvideox-ddim-inversion-pipeline) | - | [LittleNyima](https://github.com/LittleNyima) |
|
||||
| FaithDiff Stable Diffusion XL Pipeline | Implementation of [(CVPR 2025) FaithDiff: Unleashing Diffusion Priors for Faithful Image Super-resolutionUnleashing Diffusion Priors for Faithful Image Super-resolution](https://arxiv.org/abs/2411.18824) - FaithDiff is a faithful image super-resolution method that leverages latent diffusion models by actively adapting the diffusion prior and jointly fine-tuning its components (encoder and diffusion model) with an alignment module to ensure high fidelity and structural consistency. | [FaithDiff Stable Diffusion XL Pipeline](#faithdiff-stable-diffusion-xl-pipeline) | [](https://huggingface.co/jychen9811/FaithDiff) | [Junyang Chen, Jinshan Pan, Jiangxin Dong, IMAG Lab, (Adapted by Eliseu Silva)](https://github.com/JyChen9811/FaithDiff) |
|
||||
| Stable Diffusion 3 InstructPix2Pix Pipeline | Implementation of Stable Diffusion 3 InstructPix2Pix Pipeline | [Stable Diffusion 3 InstructPix2Pix Pipeline](#stable-diffusion-3-instructpix2pix-pipeline) | [](https://huggingface.co/BleachNick/SD3_UltraEdit_freeform) [](https://huggingface.co/CaptainZZZ/sd3-instructpix2pix) | [Jiayu Zhang](https://github.com/xduzhangjiayu) and [Haozhe Zhao](https://github.com/HaozheZhao)|
|
||||
To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
|
||||
|
||||
```py
|
||||
@@ -4864,7 +4865,7 @@ python -m pip install intel_extension_for_pytorch
|
||||
```
|
||||
python -m pip install intel_extension_for_pytorch==<version_name> -f https://developer.intel.com/ipex-whl-stable-cpu
|
||||
```
|
||||
2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX accelaration. Supported inference datatypes are Float32 and BFloat16.
|
||||
2. After pipeline initialization, `prepare_for_ipex()` should be called to enable IPEX acceleration. Supported inference datatypes are Float32 and BFloat16.
|
||||
|
||||
```python
|
||||
pipe = AnimateDiffPipelineIpex.from_pretrained(base, motion_adapter=adapter, torch_dtype=dtype).to(device)
|
||||
@@ -5432,4 +5433,50 @@ cropped_image = gen_image.crop((0, 0, width_init, height_init))
|
||||
cropped_image.save("data/result.png")
|
||||
````
|
||||
### Result
|
||||
[<img src="https://huggingface.co/datasets/DEVAIEXP/assets/resolve/main/faithdiff_restored.PNG" width="512px" height="512px"/>](https://imgsli.com/MzY1NzE2)
|
||||
[<img src="https://huggingface.co/datasets/DEVAIEXP/assets/resolve/main/faithdiff_restored.PNG" width="512px" height="512px"/>](https://imgsli.com/MzY1NzE2)
|
||||
|
||||
|
||||
# Stable Diffusion 3 InstructPix2Pix Pipeline
|
||||
This the implementation of the Stable Diffusion 3 InstructPix2Pix Pipeline, based on the HuggingFace Diffusers.
|
||||
|
||||
## Example Usage
|
||||
This pipeline aims to edit image based on user's instruction by using SD3
|
||||
````py
|
||||
import torch
|
||||
from diffusers import SD3Transformer2DModel
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.utils import load_image
|
||||
|
||||
|
||||
resolution = 512
|
||||
image = load_image("https://hf.co/datasets/diffusers/diffusers-images-docs/resolve/main/mountain.png").resize(
|
||||
(resolution, resolution)
|
||||
)
|
||||
edit_instruction = "Turn sky into a sunny one"
|
||||
|
||||
|
||||
pipe = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-3-medium-diffusers", custom_pipeline="pipeline_stable_diffusion_3_instruct_pix2pix", torch_dtype=torch.float16).to('cuda')
|
||||
|
||||
pipe.transformer = SD3Transformer2DModel.from_pretrained("CaptainZZZ/sd3-instructpix2pix",torch_dtype=torch.float16).to('cuda')
|
||||
|
||||
edited_image = pipe(
|
||||
prompt=edit_instruction,
|
||||
image=image,
|
||||
height=resolution,
|
||||
width=resolution,
|
||||
guidance_scale=7.5,
|
||||
image_guidance_scale=1.5,
|
||||
num_inference_steps=30,
|
||||
).images[0]
|
||||
|
||||
edited_image.save("edited_image.png")
|
||||
````
|
||||
|Original|Edited|
|
||||
|---|---|
|
||||
||
|
||||
|
||||
### Note
|
||||
This model is trained on 512x512, so input size is better on 512x512.
|
||||
For better editing performance, please refer to this powerful model https://huggingface.co/BleachNick/SD3_UltraEdit_freeform and Paper "UltraEdit: Instruction-based Fine-Grained Image
|
||||
Editing at Scale", many thanks to their contribution!
|
||||
@@ -336,13 +336,13 @@ if __name__ == "__main__":
|
||||
expanded_kernel_width = np.ceil(kernel_width) + 2
|
||||
|
||||
# Determine a set of field_of_view for each each output position, these are the pixels in the input image
|
||||
# that the pixel in the output image 'sees'. We get a matrix whos horizontal dim is the output pixels (big) and the
|
||||
# that the pixel in the output image 'sees'. We get a matrix whose horizontal dim is the output pixels (big) and the
|
||||
# vertical dim is the pixels it 'sees' (kernel_size + 2)
|
||||
field_of_view = np.squeeze(
|
||||
np.int16(np.expand_dims(left_boundary, axis=1) + np.arange(expanded_kernel_width) - 1)
|
||||
)
|
||||
|
||||
# Assign weight to each pixel in the field of view. A matrix whos horizontal dim is the output pixels and the
|
||||
# Assign weight to each pixel in the field of view. A matrix whose horizontal dim is the output pixels and the
|
||||
# vertical dim is a list of weights matching to the pixel in the field of view (that are specified in
|
||||
# 'field_of_view')
|
||||
weights = fixed_kernel(1.0 * np.expand_dims(match_coordinates, axis=1) - field_of_view - 1)
|
||||
|
||||
@@ -201,16 +201,16 @@ class PAIntAAttnProcessor:
|
||||
# ================================================== #
|
||||
# We use a hack by running the code from the BasicTransformerBlock that is between Self and Cross attentions here
|
||||
# The other option would've been modifying the BasicTransformerBlock and adding this functionality here.
|
||||
# I assumed that changing the BasicTransformerBlock would have been a bigger deal and decided to use this hack isntead.
|
||||
# I assumed that changing the BasicTransformerBlock would have been a bigger deal and decided to use this hack instead.
|
||||
|
||||
# The SelfAttention block recieves the normalized latents from the BasicTransformerBlock,
|
||||
# The SelfAttention block receives the normalized latents from the BasicTransformerBlock,
|
||||
# But the residual of the output is the non-normalized version.
|
||||
# Therefore we unnormalize the input hidden state here
|
||||
unnormalized_input_hidden_states = (
|
||||
input_hidden_states + self.transformer_block.norm1.bias
|
||||
) * self.transformer_block.norm1.weight
|
||||
|
||||
# TODO: return if neccessary
|
||||
# TODO: return if necessary
|
||||
# if self.use_ada_layer_norm_zero:
|
||||
# attn_output = gate_msa.unsqueeze(1) * attn_output
|
||||
# elif self.use_ada_layer_norm_single:
|
||||
@@ -220,7 +220,7 @@ class PAIntAAttnProcessor:
|
||||
if transformer_hidden_states.ndim == 4:
|
||||
transformer_hidden_states = transformer_hidden_states.squeeze(1)
|
||||
|
||||
# TODO: return if neccessary
|
||||
# TODO: return if necessary
|
||||
# 2.5 GLIGEN Control
|
||||
# if gligen_kwargs is not None:
|
||||
# transformer_hidden_states = self.fuser(transformer_hidden_states, gligen_kwargs["objs"])
|
||||
@@ -266,7 +266,7 @@ class PAIntAAttnProcessor:
|
||||
) = cross_attention_input_hidden_states.chunk(2)
|
||||
|
||||
# Same split for the encoder_hidden_states i.e. the tokens
|
||||
# Since the SelfAttention processors don't get the encoder states as input, we inject them into the processor in the begining.
|
||||
# Since the SelfAttention processors don't get the encoder states as input, we inject them into the processor in the beginning.
|
||||
_encoder_hidden_states_unconditional, encoder_hidden_states_conditional = self.encoder_hidden_states.chunk(
|
||||
2
|
||||
)
|
||||
@@ -896,7 +896,7 @@ class StableDiffusionHDPainterPipeline(StableDiffusionInpaintPipeline):
|
||||
class GaussianSmoothing(nn.Module):
|
||||
"""
|
||||
Apply gaussian smoothing on a
|
||||
1d, 2d or 3d tensor. Filtering is performed seperately for each channel
|
||||
1d, 2d or 3d tensor. Filtering is performed separately for each channel
|
||||
in the input using a depthwise convolution.
|
||||
|
||||
Args:
|
||||
|
||||
@@ -161,7 +161,7 @@ class ImageToImageInpaintingPipeline(DiffusionPipeline):
|
||||
`Image`, or tensor representing an image batch which will be inpainted, *i.e.* parts of the image will
|
||||
be masked out with `mask_image` and repainted according to `prompt`.
|
||||
inner_image (`torch.Tensor` or `PIL.Image.Image`):
|
||||
`Image`, or tensor representing an image batch which will be overlayed onto `image`. Non-transparent
|
||||
`Image`, or tensor representing an image batch which will be overlaid onto `image`. Non-transparent
|
||||
regions of `inner_image` must fit inside white pixels in `mask_image`. Expects four channels, with
|
||||
the last channel representing the alpha channel, which will be used to blend `inner_image` with
|
||||
`image`. If not provided, it will be forcibly cast to RGBA.
|
||||
|
||||
@@ -647,7 +647,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
|
||||
return sample
|
||||
|
||||
def set_timesteps(
|
||||
self, stength, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None
|
||||
self, strength, num_inference_steps: int, lcm_origin_steps: int, device: Union[str, torch.device] = None
|
||||
):
|
||||
"""
|
||||
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
|
||||
@@ -668,7 +668,7 @@ class LCMSchedulerWithTimestamp(SchedulerMixin, ConfigMixin):
|
||||
# LCM Timesteps Setting: # Linear Spacing
|
||||
c = self.config.num_train_timesteps // lcm_origin_steps
|
||||
lcm_origin_timesteps = (
|
||||
np.asarray(list(range(1, int(lcm_origin_steps * stength) + 1))) * c - 1
|
||||
np.asarray(list(range(1, int(lcm_origin_steps * strength) + 1))) * c - 1
|
||||
) # LCM Training Steps Schedule
|
||||
skipping_step = len(lcm_origin_timesteps) // num_inference_steps
|
||||
timesteps = lcm_origin_timesteps[::-skipping_step][:num_inference_steps] # LCM Inference Steps Schedule
|
||||
|
||||
@@ -129,7 +129,7 @@ class MagicMixPipeline(DiffusionPipeline):
|
||||
|
||||
input = (
|
||||
(mix_factor * latents) + (1 - mix_factor) * orig_latents
|
||||
) # interpolating between layout noise and conditionally generated noise to preserve layout sematics
|
||||
) # interpolating between layout noise and conditionally generated noise to preserve layout semantics
|
||||
input = torch.cat([input] * 2)
|
||||
|
||||
else: # content generation phase
|
||||
|
||||
@@ -196,9 +196,9 @@ class StableDiffusionTilingPipeline(DiffusionPipeline, StableDiffusionExtrasMixi
|
||||
guidance_scale_tiles: specific weights for classifier-free guidance in each tile.
|
||||
guidance_scale_tiles: specific weights for classifier-free guidance in each tile. If None, the value provided in guidance_scale will be used.
|
||||
seed_tiles: specific seeds for the initialization latents in each tile. These will override the latents generated for the whole canvas using the standard seed parameter.
|
||||
seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overriden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overriden.
|
||||
seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overriden using the given seed. Takes priority over seed_tiles.
|
||||
cpu_vae: the decoder from latent space to pixel space can require too mucho GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues.
|
||||
seed_tiles_mode: either "full" "exclusive". If "full", all the latents affected by the tile be overridden. If "exclusive", only the latents that are affected exclusively by this tile (and no other tiles) will be overridden.
|
||||
seed_reroll_regions: a list of tuples in the form (start row, end row, start column, end column, seed) defining regions in pixel space for which the latents will be overridden using the given seed. Takes priority over seed_tiles.
|
||||
cpu_vae: the decoder from latent space to pixel space can require too much GPU RAM for large images. If you find out of memory errors at the end of the generation process, try setting this parameter to True to run the decoder in CPU. Slower, but should run without memory issues.
|
||||
|
||||
Examples:
|
||||
|
||||
|
||||
@@ -1258,7 +1258,7 @@ class KolorsControlNetPipeline(
|
||||
)
|
||||
|
||||
if guess_mode and self.do_classifier_free_guidance:
|
||||
# Infered ControlNet only for the conditional batch.
|
||||
# Inferred ControlNet only for the conditional batch.
|
||||
# To apply the output of ControlNet to both the unconditional and conditional batches,
|
||||
# add 0 to the unconditional batch to keep it unchanged.
|
||||
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
|
||||
|
||||
@@ -1462,7 +1462,7 @@ class KolorsControlNetImg2ImgPipeline(
|
||||
)
|
||||
|
||||
if guess_mode and self.do_classifier_free_guidance:
|
||||
# Infered ControlNet only for the conditional batch.
|
||||
# Inferred ControlNet only for the conditional batch.
|
||||
# To apply the output of ControlNet to both the unconditional and conditional batches,
|
||||
# add 0 to the unconditional batch to keep it unchanged.
|
||||
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
|
||||
|
||||
@@ -1782,7 +1782,7 @@ class KolorsControlNetInpaintPipeline(
|
||||
)
|
||||
|
||||
if guess_mode and self.do_classifier_free_guidance:
|
||||
# Infered ControlNet only for the conditional batch.
|
||||
# Inferred ControlNet only for the conditional batch.
|
||||
# To apply the output of ControlNet to both the unconditional and conditional batches,
|
||||
# add 0 to the unconditional batch to keep it unchanged.
|
||||
down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples]
|
||||
|
||||
@@ -559,7 +559,7 @@ class FabricPipeline(DiffusionPipeline):
|
||||
End point for providing feedback (between 0 and 1).
|
||||
min_weight (`float`, *optional*, defaults to `.05`):
|
||||
Minimum weight for feedback.
|
||||
max_weight (`float`, *optional*, defults tp `1.0`):
|
||||
max_weight (`float`, *optional*, defaults tp `1.0`):
|
||||
Maximum weight for feedback.
|
||||
neg_scale (`float`, *optional*, defaults to `.5`):
|
||||
Scale factor for negative feedback.
|
||||
|
||||
@@ -118,7 +118,7 @@ EXAMPLE_DOC_STRING = """
|
||||
>>> # Here we need use pipeline internal unet model
|
||||
>>> pipe.unet = pipe.unet_model.from_pretrained(model_id, subfolder="unet", variant="fp16", use_safetensors=True)
|
||||
>>>
|
||||
>>> # Load aditional layers to the model
|
||||
>>> # Load additional layers to the model
|
||||
>>> pipe.unet.load_additional_layers(weight_path="proc_data/faithdiff/FaithDiff.bin", dtype=dtype)
|
||||
>>>
|
||||
>>> # Enable vae tiling
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -72,7 +72,7 @@ class GaussianSmoothing(nn.Module):
|
||||
"""
|
||||
Copied from official repo: https://github.com/showlab/BoxDiff/blob/master/utils/gaussian_smoothing.py
|
||||
Apply gaussian smoothing on a
|
||||
1d, 2d or 3d tensor. Filtering is performed seperately for each channel
|
||||
1d, 2d or 3d tensor. Filtering is performed separately for each channel
|
||||
in the input using a depthwise convolution.
|
||||
Arguments:
|
||||
channels (int, sequence): Number of channels of the input tensors. Output will
|
||||
|
||||
@@ -1509,7 +1509,7 @@ class StableDiffusionXL_AE_Pipeline(
|
||||
|
||||
add_time_ids = add_time_ids.repeat(batch_size, 1).to(DEVICE)
|
||||
|
||||
# interative sampling
|
||||
# interactive sampling
|
||||
self.scheduler.set_timesteps(num_inference_steps)
|
||||
latents_list = [latents]
|
||||
pred_x0_list = []
|
||||
@@ -1548,7 +1548,7 @@ class StableDiffusionXL_AE_Pipeline(
|
||||
x: torch.FloatTensor,
|
||||
):
|
||||
"""
|
||||
predict the sampe the next step in the denoise process.
|
||||
predict the sample the next step in the denoise process.
|
||||
"""
|
||||
ref_noise = model_output[:1, :, :, :].expand(model_output.shape)
|
||||
alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
|
||||
|
||||
@@ -132,7 +132,7 @@ def _preprocess_adapter_image(image, height, width):
|
||||
image = torch.cat(image, dim=0)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}"
|
||||
f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but receive: {image[0].ndim}"
|
||||
)
|
||||
return image
|
||||
|
||||
|
||||
@@ -150,7 +150,7 @@ def _preprocess_adapter_image(image, height, width):
|
||||
image = torch.cat(image, dim=0)
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but recive: {image[0].ndim}"
|
||||
f"Invalid image tensor! Expecting image tensor with 3 or 4 dimension, but receive: {image[0].ndim}"
|
||||
)
|
||||
return image
|
||||
|
||||
|
||||
@@ -220,7 +220,7 @@ class RegionalPromptingStableDiffusionPipeline(StableDiffusionPipeline):
|
||||
revers = True
|
||||
|
||||
def pcallback(s_self, step: int, timestep: int, latents: torch.Tensor, selfs=None):
|
||||
if "PRO" in mode: # in Prompt mode, make masks from sum of attension maps
|
||||
if "PRO" in mode: # in Prompt mode, make masks from sum of attention maps
|
||||
self.step = step
|
||||
|
||||
if len(self.attnmaps_sizes) > 3:
|
||||
@@ -552,9 +552,9 @@ def get_attn_maps(self, attn):
|
||||
|
||||
def reset_attnmaps(self): # init parameters in every batch
|
||||
self.step = 0
|
||||
self.attnmaps = {} # maked from attention maps
|
||||
self.attnmaps = {} # made from attention maps
|
||||
self.attnmaps_sizes = [] # height,width set of u-net blocks
|
||||
self.attnmasks = {} # maked from attnmaps for regions
|
||||
self.attnmasks = {} # made from attnmaps for regions
|
||||
self.maskready = False
|
||||
self.history = {}
|
||||
|
||||
|
||||
@@ -97,7 +97,7 @@ class SdeDragPipeline(DiffusionPipeline):
|
||||
steps (`int`, *optional*, defaults to 200):
|
||||
The number of sampling iterations.
|
||||
step_size (`int`, *optional*, defaults to 2):
|
||||
The drag diatance of each drag step.
|
||||
The drag distance of each drag step.
|
||||
image_scale (`float`, *optional*, defaults to 0.3):
|
||||
To avoid duplicating the content, use image_scale to perturbs the source.
|
||||
adapt_radius (`int`, *optional*, defaults to 5):
|
||||
|
||||
@@ -284,7 +284,7 @@ class UnCLIPImageInterpolationPipeline(DiffusionPipeline):
|
||||
)
|
||||
else:
|
||||
raise AssertionError(
|
||||
f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or torch.Tensor respectively. Received {type(image)} and {type(image_embeddings)} repsectively"
|
||||
f"Expected 'image' or 'image_embeddings' to be not None with types List[PIL.Image] or torch.Tensor respectively. Received {type(image)} and {type(image_embeddings)} respectively"
|
||||
)
|
||||
|
||||
original_image_embeddings = self._encode_image(
|
||||
|
||||
@@ -1012,7 +1012,7 @@ def main(args):
|
||||
unet = get_peft_model(unet, lora_config)
|
||||
|
||||
# 9. Handle mixed precision and device placement
|
||||
# For mixed precision training we cast all non-trainable weigths to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@@ -829,7 +829,7 @@ def main(args):
|
||||
)
|
||||
|
||||
# 8. Handle mixed precision and device placement
|
||||
# For mixed precision training we cast all non-trainable weigths to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@@ -1026,7 +1026,7 @@ def main(args):
|
||||
unet = get_peft_model(unet, lora_config)
|
||||
|
||||
# 9. Handle mixed precision and device placement
|
||||
# For mixed precision training we cast all non-trainable weigths to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@@ -962,7 +962,7 @@ def main(args):
|
||||
)
|
||||
|
||||
# 9. Handle mixed precision and device placement
|
||||
# For mixed precision training we cast all non-trainable weigths to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@@ -1021,7 +1021,7 @@ def main(args):
|
||||
)
|
||||
|
||||
# 9. Handle mixed precision and device placement
|
||||
# For mixed precision training we cast all non-trainable weigths to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@@ -411,7 +411,7 @@ export CAPTION_COLUMN='caption_column'
|
||||
|
||||
export CACHE_DIR="/data/train_csr/.cache/huggingface/"
|
||||
export OUTPUT_DIR='/data/train_csr/FLUX/MODEL_OUT/'$MODEL_TYPE
|
||||
# The first step is to use Python to precompute all caches.Replace the first line below with this line. (I am not sure why using acclerate would cause problems.)
|
||||
# The first step is to use Python to precompute all caches.Replace the first line below with this line. (I am not sure why using accelerate would cause problems.)
|
||||
|
||||
CUDA_VISIBLE_DEVICES=0 python3 train_controlnet_flux.py \
|
||||
|
||||
|
||||
@@ -639,6 +639,15 @@ def parse_args(input_args=None):
|
||||
action="store_true",
|
||||
help="Enable model cpu offload and save memory.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_interpolation_mode",
|
||||
type=str,
|
||||
default="lanczos",
|
||||
choices=[
|
||||
f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
|
||||
],
|
||||
help="The image interpolation method to use for resizing images.",
|
||||
)
|
||||
|
||||
if input_args is not None:
|
||||
args = parser.parse_args(input_args)
|
||||
@@ -736,9 +745,13 @@ def get_train_dataset(args, accelerator):
|
||||
|
||||
|
||||
def prepare_train_dataset(dataset, accelerator):
|
||||
interpolation = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper(), None)
|
||||
if interpolation is None:
|
||||
raise ValueError(f"Unsupported interpolation mode {interpolation=}.")
|
||||
|
||||
image_transforms = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.Resize(args.resolution, interpolation=interpolation),
|
||||
transforms.CenterCrop(args.resolution),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
@@ -747,7 +760,7 @@ def prepare_train_dataset(dataset, accelerator):
|
||||
|
||||
conditioning_image_transforms = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.Resize(args.resolution, interpolation=interpolation),
|
||||
transforms.CenterCrop(args.resolution),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
|
||||
@@ -134,7 +134,25 @@ def log_validation(vae, unet, controlnet, args, accelerator, weight_dtype, step,
|
||||
|
||||
for validation_prompt, validation_image in zip(validation_prompts, validation_images):
|
||||
validation_image = Image.open(validation_image).convert("RGB")
|
||||
validation_image = validation_image.resize((args.resolution, args.resolution))
|
||||
|
||||
try:
|
||||
interpolation = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper())
|
||||
except (AttributeError, KeyError):
|
||||
supported_interpolation_modes = [
|
||||
f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
|
||||
]
|
||||
raise ValueError(
|
||||
f"Interpolation mode {args.image_interpolation_mode} is not supported. "
|
||||
f"Please select one of the following: {', '.join(supported_interpolation_modes)}"
|
||||
)
|
||||
|
||||
transform = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(args.resolution, interpolation=interpolation),
|
||||
transforms.CenterCrop(args.resolution),
|
||||
]
|
||||
)
|
||||
validation_image = transform(validation_image)
|
||||
|
||||
images = []
|
||||
|
||||
@@ -587,6 +605,15 @@ def parse_args(input_args=None):
|
||||
" more information see https://huggingface.co/docs/accelerate/v0.17.0/en/package_reference/accelerator#accelerate.Accelerator"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_interpolation_mode",
|
||||
type=str,
|
||||
default="lanczos",
|
||||
choices=[
|
||||
f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
|
||||
],
|
||||
help="The image interpolation method to use for resizing images.",
|
||||
)
|
||||
|
||||
if input_args is not None:
|
||||
args = parser.parse_args(input_args)
|
||||
@@ -732,9 +759,20 @@ def encode_prompt(prompt_batch, text_encoders, tokenizers, proportion_empty_prom
|
||||
|
||||
|
||||
def prepare_train_dataset(dataset, accelerator):
|
||||
try:
|
||||
interpolation_mode = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper())
|
||||
except (AttributeError, KeyError):
|
||||
supported_interpolation_modes = [
|
||||
f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
|
||||
]
|
||||
raise ValueError(
|
||||
f"Interpolation mode {args.image_interpolation_mode} is not supported. "
|
||||
f"Please select one of the following: {', '.join(supported_interpolation_modes)}"
|
||||
)
|
||||
|
||||
image_transforms = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.Resize(args.resolution, interpolation=interpolation_mode),
|
||||
transforms.CenterCrop(args.resolution),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
@@ -743,7 +781,7 @@ def prepare_train_dataset(dataset, accelerator):
|
||||
|
||||
conditioning_image_transforms = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.Resize(args.resolution, interpolation=interpolation_mode),
|
||||
transforms.CenterCrop(args.resolution),
|
||||
transforms.ToTensor(),
|
||||
]
|
||||
|
||||
@@ -173,13 +173,13 @@ accelerate launch train_dreambooth_lora_flux.py \
|
||||
### Target Modules
|
||||
When LoRA was first adapted from language models to diffusion models, it was applied to the cross-attention layers in the Unet that relate the image representations with the prompts that describe them.
|
||||
More recently, SOTA text-to-image diffusion models replaced the Unet with a diffusion Transformer(DiT). With this change, we may also want to explore
|
||||
applying LoRA training onto different types of layers and blocks. To allow more flexibility and control over the targeted modules we added `--lora_layers`- in which you can specify in a comma seperated string
|
||||
applying LoRA training onto different types of layers and blocks. To allow more flexibility and control over the targeted modules we added `--lora_layers`- in which you can specify in a comma separated string
|
||||
the exact modules for LoRA training. Here are some examples of target modules you can provide:
|
||||
- for attention only layers: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0"`
|
||||
- to train the same modules as in the fal trainer: `--lora_layers="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2"`
|
||||
- to train the same modules as in ostris ai-toolkit / replicate trainer: `--lora_blocks="attn.to_k,attn.to_q,attn.to_v,attn.to_out.0,attn.add_k_proj,attn.add_q_proj,attn.add_v_proj,attn.to_add_out,ff.net.0.proj,ff.net.2,ff_context.net.0.proj,ff_context.net.2,norm1_context.linear, norm1.linear,norm.linear,proj_mlp,proj_out"`
|
||||
> [!NOTE]
|
||||
> `--lora_layers` can also be used to specify which **blocks** to apply LoRA training to. To do so, simply add a block prefix to each layer in the comma seperated string:
|
||||
> `--lora_layers` can also be used to specify which **blocks** to apply LoRA training to. To do so, simply add a block prefix to each layer in the comma separated string:
|
||||
> **single DiT blocks**: to target the ith single transformer block, add the prefix `single_transformer_blocks.i`, e.g. - `single_transformer_blocks.i.attn.to_k`
|
||||
> **MMDiT blocks**: to target the ith MMDiT block, add the prefix `transformer_blocks.i`, e.g. - `transformer_blocks.i.attn.to_k`
|
||||
> [!NOTE]
|
||||
|
||||
@@ -107,7 +107,7 @@ To better track our training experiments, we're using the following flags in the
|
||||
|
||||
Additionally, we welcome you to explore the following CLI arguments:
|
||||
|
||||
* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
|
||||
* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
|
||||
* `--rank`: The rank of the LoRA layers. The higher the rank, the more parameters are trained. The default is 16.
|
||||
|
||||
We provide several options for optimizing memory optimization:
|
||||
@@ -117,3 +117,30 @@ We provide several options for optimizing memory optimization:
|
||||
* `--use_8bit_adam`: When enabled, we will use the 8bit version of AdamW provided by the `bitsandbytes` library.
|
||||
|
||||
Refer to the [official documentation](https://huggingface.co/docs/diffusers/main/en/api/pipelines/) of the `HiDreamImagePipeline` to know more about the model.
|
||||
|
||||
## Using quantization
|
||||
|
||||
You can quantize the base model with [`bitsandbytes`](https://huggingface.co/docs/bitsandbytes/index) to reduce memory usage. To do so, pass a JSON file path to `--bnb_quantization_config_path`. This file should hold the configuration to initialize `BitsAndBytesConfig`. Below is an example JSON file:
|
||||
|
||||
```json
|
||||
{
|
||||
"load_in_4bit": true,
|
||||
"bnb_4bit_quant_type": "nf4"
|
||||
}
|
||||
```
|
||||
|
||||
Below, we provide some numbers with and without the use of NF4 quantization when training:
|
||||
|
||||
```
|
||||
(with quantization)
|
||||
Memory (before device placement): 9.085089683532715 GB.
|
||||
Memory (after device placement): 34.59585428237915 GB.
|
||||
Memory (after backward): 36.90267467498779 GB.
|
||||
|
||||
(without quantization)
|
||||
Memory (before device placement): 0.0 GB.
|
||||
Memory (after device placement): 57.6400408744812 GB.
|
||||
Memory (after backward): 59.932212829589844 GB.
|
||||
```
|
||||
|
||||
The reason why we see some memory before device placement in the case of quantization is because, by default bnb quantized models are placed on the GPU first.
|
||||
@@ -113,7 +113,7 @@ To better track our training experiments, we're using the following flags in the
|
||||
|
||||
Additionally, we welcome you to explore the following CLI arguments:
|
||||
|
||||
* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
|
||||
* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
|
||||
* `--system_prompt`: A custom system prompt to provide additional personality to the model.
|
||||
* `--max_sequence_length`: Maximum sequence length to use for text embeddings.
|
||||
|
||||
|
||||
@@ -113,7 +113,7 @@ To better track our training experiments, we're using the following flags in the
|
||||
|
||||
Additionally, we welcome you to explore the following CLI arguments:
|
||||
|
||||
* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
|
||||
* `--lora_layers`: The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only.
|
||||
* `--complex_human_instruction`: Instructions for complex human attention as shown in [here](https://github.com/NVlabs/Sana/blob/main/configs/sana_app_config/Sana_1600M_app.yaml#L55).
|
||||
* `--max_sequence_length`: Maximum sequence length to use for text embeddings.
|
||||
|
||||
|
||||
@@ -567,7 +567,7 @@ def parse_args(input_args=None):
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
import argparse
|
||||
import copy
|
||||
import itertools
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
@@ -27,14 +28,13 @@ from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
import transformers
|
||||
from accelerate import Accelerator
|
||||
from accelerate.logging import get_logger
|
||||
from accelerate.utils import DistributedDataParallelKwargs, ProjectConfiguration, set_seed
|
||||
from huggingface_hub import create_repo, upload_folder
|
||||
from huggingface_hub.utils import insecure_hashlib
|
||||
from peft import LoraConfig, set_peft_model_state_dict
|
||||
from peft import LoraConfig, prepare_model_for_kbit_training, set_peft_model_state_dict
|
||||
from peft.utils import get_peft_model_state_dict
|
||||
from PIL import Image
|
||||
from PIL.ImageOps import exif_transpose
|
||||
@@ -47,6 +47,7 @@ from transformers import AutoTokenizer, CLIPTokenizer, LlamaForCausalLM, Pretrai
|
||||
import diffusers
|
||||
from diffusers import (
|
||||
AutoencoderKL,
|
||||
BitsAndBytesConfig,
|
||||
FlowMatchEulerDiscreteScheduler,
|
||||
HiDreamImagePipeline,
|
||||
HiDreamImageTransformer2DModel,
|
||||
@@ -282,6 +283,12 @@ def parse_args(input_args=None):
|
||||
default="meta-llama/Meta-Llama-3.1-8B-Instruct",
|
||||
help="Path to pretrained model or model identifier from huggingface.co/models.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--bnb_quantization_config_path",
|
||||
type=str,
|
||||
default=None,
|
||||
help="Quantization config in a JSON file that will be used to define the bitsandbytes quant config of the DiT.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--revision",
|
||||
type=str,
|
||||
@@ -596,7 +603,7 @@ def parse_args(input_args=None):
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
|
||||
),
|
||||
)
|
||||
|
||||
@@ -1056,6 +1063,14 @@ def main(args):
|
||||
args.pretrained_model_name_or_path, args.revision, subfolder="text_encoder_3"
|
||||
)
|
||||
|
||||
# For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
weight_dtype = torch.float16
|
||||
elif accelerator.mixed_precision == "bf16":
|
||||
weight_dtype = torch.bfloat16
|
||||
|
||||
# Load scheduler and models
|
||||
noise_scheduler = FlowMatchEulerDiscreteScheduler.from_pretrained(
|
||||
args.pretrained_model_name_or_path, subfolder="scheduler", revision=args.revision, shift=3.0
|
||||
@@ -1064,20 +1079,31 @@ def main(args):
|
||||
text_encoder_one, text_encoder_two, text_encoder_three, text_encoder_four = load_text_encoders(
|
||||
text_encoder_cls_one, text_encoder_cls_two, text_encoder_cls_three
|
||||
)
|
||||
|
||||
vae = AutoencoderKL.from_pretrained(
|
||||
args.pretrained_model_name_or_path,
|
||||
subfolder="vae",
|
||||
revision=args.revision,
|
||||
variant=args.variant,
|
||||
)
|
||||
quantization_config = None
|
||||
if args.bnb_quantization_config_path is not None:
|
||||
with open(args.bnb_quantization_config_path, "r") as f:
|
||||
config_kwargs = json.load(f)
|
||||
if "load_in_4bit" in config_kwargs and config_kwargs["load_in_4bit"]:
|
||||
config_kwargs["bnb_4bit_compute_dtype"] = weight_dtype
|
||||
quantization_config = BitsAndBytesConfig(**config_kwargs)
|
||||
|
||||
transformer = HiDreamImageTransformer2DModel.from_pretrained(
|
||||
args.pretrained_model_name_or_path,
|
||||
subfolder="transformer",
|
||||
revision=args.revision,
|
||||
variant=args.variant,
|
||||
quantization_config=quantization_config,
|
||||
torch_dtype=weight_dtype,
|
||||
force_inference_output=True,
|
||||
)
|
||||
if args.bnb_quantization_config_path is not None:
|
||||
transformer = prepare_model_for_kbit_training(transformer, use_gradient_checkpointing=False)
|
||||
|
||||
# We only train the additional adapter LoRA layers
|
||||
transformer.requires_grad_(False)
|
||||
@@ -1087,14 +1113,6 @@ def main(args):
|
||||
text_encoder_three.requires_grad_(False)
|
||||
text_encoder_four.requires_grad_(False)
|
||||
|
||||
# For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
weight_dtype = torch.float16
|
||||
elif accelerator.mixed_precision == "bf16":
|
||||
weight_dtype = torch.bfloat16
|
||||
|
||||
if torch.backends.mps.is_available() and weight_dtype == torch.bfloat16:
|
||||
# due to pytorch#99272, MPS does not yet support bfloat16.
|
||||
raise ValueError(
|
||||
@@ -1109,7 +1127,12 @@ def main(args):
|
||||
text_encoder_three.to(**to_kwargs)
|
||||
text_encoder_four.to(**to_kwargs)
|
||||
# we never offload the transformer to CPU, so we can just use the accelerator device
|
||||
transformer.to(accelerator.device, dtype=weight_dtype)
|
||||
transformer_to_kwargs = (
|
||||
{"device": accelerator.device}
|
||||
if args.bnb_quantization_config_path is not None
|
||||
else {"device": accelerator.device, "dtype": weight_dtype}
|
||||
)
|
||||
transformer.to(**transformer_to_kwargs)
|
||||
|
||||
# Initialize a text encoding pipeline and keep it to CPU for now.
|
||||
text_encoding_pipeline = HiDreamImagePipeline.from_pretrained(
|
||||
@@ -1695,10 +1718,11 @@ def main(args):
|
||||
accelerator.wait_for_everyone()
|
||||
if accelerator.is_main_process:
|
||||
transformer = unwrap_model(transformer)
|
||||
if args.upcast_before_saving:
|
||||
transformer.to(torch.float32)
|
||||
else:
|
||||
transformer = transformer.to(weight_dtype)
|
||||
if args.bnb_quantization_config_path is None:
|
||||
if args.upcast_before_saving:
|
||||
transformer.to(torch.float32)
|
||||
else:
|
||||
transformer = transformer.to(weight_dtype)
|
||||
transformer_lora_layers = get_peft_model_state_dict(transformer)
|
||||
|
||||
HiDreamImagePipeline.save_lora_weights(
|
||||
|
||||
@@ -514,7 +514,7 @@ def parse_args(input_args=None):
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
|
||||
),
|
||||
)
|
||||
|
||||
@@ -599,6 +599,15 @@ def parse_args(input_args=None):
|
||||
"Defaults to precision dtype used for training to save memory"
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_interpolation_mode",
|
||||
type=str,
|
||||
default="lanczos",
|
||||
choices=[
|
||||
f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
|
||||
],
|
||||
help="The image interpolation method to use for resizing images.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--offload",
|
||||
action="store_true",
|
||||
@@ -724,7 +733,11 @@ class DreamBoothDataset(Dataset):
|
||||
self.instance_images.extend(itertools.repeat(img, repeats))
|
||||
|
||||
self.pixel_values = []
|
||||
train_resize = transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR)
|
||||
interpolation = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper(), None)
|
||||
if interpolation is None:
|
||||
raise ValueError(f"Unsupported interpolation mode: {args.image_interpolation_mode}")
|
||||
|
||||
train_resize = transforms.Resize(size, interpolation=interpolation)
|
||||
train_crop = transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size)
|
||||
train_flip = transforms.RandomHorizontalFlip(p=1.0)
|
||||
train_transforms = transforms.Compose(
|
||||
@@ -768,7 +781,7 @@ class DreamBoothDataset(Dataset):
|
||||
|
||||
self.image_transforms = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.Resize(size, interpolation=interpolation),
|
||||
transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
|
||||
@@ -513,7 +513,7 @@ def parse_args(input_args=None):
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v" will result in lora training of attention layers only'
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@@ -576,7 +576,7 @@ def parse_args(input_args=None):
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"The transformer block layers to apply LoRA training on. Please specify the layers in a comma seperated string."
|
||||
"The transformer block layers to apply LoRA training on. Please specify the layers in a comma separated string."
|
||||
"For examples refer to https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_SD3.md"
|
||||
),
|
||||
)
|
||||
@@ -585,7 +585,7 @@ def parse_args(input_args=None):
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
"The transformer blocks to apply LoRA training on. Please specify the block numbers in a comma seperated manner."
|
||||
"The transformer blocks to apply LoRA training on. Please specify the block numbers in a comma separated manner."
|
||||
'E.g. - "--lora_blocks 12,30" will result in lora training of transformer blocks 12 and 30. For more examples refer to https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/README_SD3.md'
|
||||
),
|
||||
)
|
||||
|
||||
@@ -664,7 +664,7 @@ def parse_args(input_args=None):
|
||||
action="store_true",
|
||||
default=False,
|
||||
help=(
|
||||
"Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
|
||||
"Whether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
|
||||
"Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
|
||||
),
|
||||
)
|
||||
@@ -852,7 +852,7 @@ class DreamBoothDataset(Dataset):
|
||||
|
||||
self.image_transforms = transforms.Compose(
|
||||
[
|
||||
transforms.Resize(size, interpolation=transforms.InterpolationMode.BILINEAR),
|
||||
transforms.Resize(size, interpolation=interpolation),
|
||||
transforms.CenterCrop(size) if center_crop else transforms.RandomCrop(size),
|
||||
transforms.ToTensor(),
|
||||
transforms.Normalize([0.5], [0.5]),
|
||||
|
||||
@@ -329,7 +329,7 @@ def parse_args(input_args=None):
|
||||
type=str,
|
||||
default=None,
|
||||
help=(
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma seperated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
|
||||
'The transformer modules to apply LoRA training on. Please specify the layers in a comma separated. E.g. - "to_k,to_q,to_v,to_out.0" will result in lora training of attention layers only'
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
|
||||
@@ -400,7 +400,7 @@ def main():
|
||||
|
||||
image_encoder.requires_grad_(False)
|
||||
|
||||
# For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@@ -1147,7 +1147,7 @@ def main(args):
|
||||
tracker_config = dict(vars(args))
|
||||
accelerator.init_trackers(args.tracker_project_name, config=tracker_config)
|
||||
|
||||
# Function for unwraping if torch.compile() was used in accelerate.
|
||||
# Function for unwrapping if torch.compile() was used in accelerate.
|
||||
def unwrap_model(model):
|
||||
model = accelerator.unwrap_model(model)
|
||||
model = model._orig_mod if is_compiled_module(model) else model
|
||||
|
||||
@@ -69,7 +69,7 @@ accelerate launch --config_file=accelerate.yaml \
|
||||
--seed="0"
|
||||
```
|
||||
|
||||
We can direcly pass a quantized checkpoint path, too:
|
||||
We can directly pass a quantized checkpoint path, too:
|
||||
|
||||
```diff
|
||||
+ --quantized_model_path="hf-internal-testing/flux.1-dev-nf4-pkg"
|
||||
|
||||
@@ -13,7 +13,7 @@ args = parser.parse_args()
|
||||
|
||||
|
||||
device = "cpu"
|
||||
prompt = "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brighly buildings"
|
||||
prompt = "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brightly buildings"
|
||||
|
||||
model_id = "path-to-your-trained-model"
|
||||
pipe = StableDiffusionPipeline.from_pretrained(model_id)
|
||||
|
||||
@@ -80,7 +80,7 @@ export INT8_MODEL_NAME="./int8_model"
|
||||
|
||||
python text2images.py \
|
||||
--pretrained_model_name_or_path=$INT8_MODEL_NAME \
|
||||
--caption "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brighly buildings." \
|
||||
--caption "a lovely <dicoo> in red dress and hat, in the snowly and brightly night, with many brightly buildings." \
|
||||
--images_num 4
|
||||
```
|
||||
|
||||
|
||||
@@ -664,7 +664,7 @@ class PixArtAlphaControlnetPipeline(DiffusionPipeline):
|
||||
# &
|
||||
caption = re.sub(r"&", "", caption)
|
||||
|
||||
# ip adresses:
|
||||
# ip addresses:
|
||||
caption = re.sub(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", " ", caption)
|
||||
|
||||
# article ids:
|
||||
|
||||
@@ -612,7 +612,7 @@ def main():
|
||||
# See Section 3.1. of the paper.
|
||||
max_length = 120
|
||||
|
||||
# For mixed precision training we cast all non-trainable weigths (vae, text_encoder) to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights (vae, text_encoder) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@@ -120,11 +120,11 @@ if __name__ == "__main__":
|
||||
parser.add_argument("--schnell", action="store_true", help="run flux schnell instead of dev")
|
||||
parser.add_argument("--width", type=int, default=1024, help="width of the image to generate")
|
||||
parser.add_argument("--height", type=int, default=1024, help="height of the image to generate")
|
||||
parser.add_argument("--guidance", type=float, default=3.5, help="gauidance strentgh for dev")
|
||||
parser.add_argument("--guidance", type=float, default=3.5, help="guidance strength for dev")
|
||||
parser.add_argument("--seed", type=int, default=None, help="seed for inference")
|
||||
parser.add_argument("--profile", action="store_true", help="enable profiling")
|
||||
parser.add_argument("--profile-duration", type=int, default=10000, help="duration for profiling in msec.")
|
||||
parser.add_argument("--itters", type=int, default=15, help="tiems to run inference and get avg time in sec.")
|
||||
parser.add_argument("--itters", type=int, default=15, help="items to run inference and get avg time in sec.")
|
||||
args = parser.parse_args()
|
||||
if args.schnell:
|
||||
ckpt_id = "black-forest-labs/FLUX.1-schnell"
|
||||
|
||||
@@ -759,7 +759,7 @@ def main(args):
|
||||
unet, text_encoder, optimizer, train_dataloader
|
||||
)
|
||||
|
||||
# For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
+1
-1
@@ -661,7 +661,7 @@ def parse_args(input_args=None):
|
||||
action="store_true",
|
||||
default=False,
|
||||
help=(
|
||||
"Wether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
|
||||
"Whether to train a DoRA as proposed in- DoRA: Weight-Decomposed Low-Rank Adaptation https://arxiv.org/abs/2402.09353. "
|
||||
"Note: to use DoRA you need to install peft from main, `pip install git+https://github.com/huggingface/peft.git`"
|
||||
),
|
||||
)
|
||||
|
||||
@@ -480,6 +480,15 @@ def parse_args(input_args=None):
|
||||
action="store_true",
|
||||
help="debug loss for each image, if filenames are available in the dataset",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--image_interpolation_mode",
|
||||
type=str,
|
||||
default="lanczos",
|
||||
choices=[
|
||||
f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
|
||||
],
|
||||
help="The image interpolation method to use for resizing images.",
|
||||
)
|
||||
|
||||
if input_args is not None:
|
||||
args = parser.parse_args(input_args)
|
||||
@@ -913,8 +922,14 @@ def main(args):
|
||||
tokens_two = tokenize_prompt(tokenizer_two, captions)
|
||||
return tokens_one, tokens_two
|
||||
|
||||
# Get the specified interpolation method from the args
|
||||
interpolation = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper(), None)
|
||||
|
||||
# Raise an error if the interpolation method is invalid
|
||||
if interpolation is None:
|
||||
raise ValueError(f"Unsupported interpolation mode {args.image_interpolation_mode}.")
|
||||
# Preprocessing the datasets.
|
||||
train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR)
|
||||
train_resize = transforms.Resize(args.resolution, interpolation=interpolation) # Use dynamic interpolation method
|
||||
train_crop = transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution)
|
||||
train_flip = transforms.RandomHorizontalFlip(p=1.0)
|
||||
train_transforms = transforms.Compose(
|
||||
|
||||
@@ -470,6 +470,15 @@ def parse_args(input_args=None):
|
||||
"--enable_xformers_memory_efficient_attention", action="store_true", help="Whether or not to use xformers."
|
||||
)
|
||||
parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")
|
||||
parser.add_argument(
|
||||
"--image_interpolation_mode",
|
||||
type=str,
|
||||
default="lanczos",
|
||||
choices=[
|
||||
f.lower() for f in dir(transforms.InterpolationMode) if not f.startswith("__") and not f.endswith("__")
|
||||
],
|
||||
help="The image interpolation method to use for resizing images.",
|
||||
)
|
||||
|
||||
if input_args is not None:
|
||||
args = parser.parse_args(input_args)
|
||||
@@ -861,7 +870,10 @@ def main(args):
|
||||
)
|
||||
|
||||
# Preprocessing the datasets.
|
||||
train_resize = transforms.Resize(args.resolution, interpolation=transforms.InterpolationMode.BILINEAR)
|
||||
interpolation = getattr(transforms.InterpolationMode, args.image_interpolation_mode.upper(), None)
|
||||
if interpolation is None:
|
||||
raise ValueError(f"Unsupported interpolation mode {interpolation=}.")
|
||||
train_resize = transforms.Resize(args.resolution, interpolation=interpolation)
|
||||
train_crop = transforms.CenterCrop(args.resolution) if args.center_crop else transforms.RandomCrop(args.resolution)
|
||||
train_flip = transforms.RandomHorizontalFlip(p=1.0)
|
||||
train_transforms = transforms.Compose([transforms.ToTensor(), transforms.Normalize([0.5], [0.5])])
|
||||
|
||||
@@ -789,7 +789,7 @@ def main():
|
||||
text_encoder, optimizer, train_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@@ -814,7 +814,7 @@ def main():
|
||||
text_encoder_1, text_encoder_2, optimizer, train_dataloader, lr_scheduler
|
||||
)
|
||||
|
||||
# For mixed precision training we cast all non-trainable weigths (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# For mixed precision training we cast all non-trainable weights (vae, non-lora text_encoder and non-lora unet) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
if accelerator.mixed_precision == "fp16":
|
||||
|
||||
@@ -220,7 +220,7 @@ def convert_flux_transformer_checkpoint_to_diffusers(
|
||||
f"double_blocks.{i}.txt_attn.proj.bias"
|
||||
)
|
||||
|
||||
# single transfomer blocks
|
||||
# single transformer blocks
|
||||
for i in range(num_single_layers):
|
||||
block_prefix = f"single_transformer_blocks.{i}."
|
||||
# norm.linear <- single_blocks.0.modulation.lin
|
||||
|
||||
@@ -394,7 +394,7 @@ if __name__ == "__main__":
|
||||
help="Scheduler type to use. Use 'scm' for Sana Sprint models.",
|
||||
)
|
||||
parser.add_argument("--dump_path", default=None, type=str, required=True, help="Path to the output pipeline.")
|
||||
parser.add_argument("--save_full_pipeline", action="store_true", help="save all the pipelien elemets in one.")
|
||||
parser.add_argument("--save_full_pipeline", action="store_true", help="save all the pipeline elements in one.")
|
||||
parser.add_argument("--dtype", default="fp32", type=str, choices=["fp32", "fp16", "bf16"], help="Weight dtype.")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
@@ -984,7 +984,7 @@ def renderer(*, args, checkpoint_map_location):
|
||||
return renderer_model
|
||||
|
||||
|
||||
# prior model will expect clip_mean and clip_std, whic are missing from the state_dict
|
||||
# prior model will expect clip_mean and clip_std, which are missing from the state_dict
|
||||
PRIOR_EXPECTED_MISSING_KEYS = ["clip_mean", "clip_std"]
|
||||
|
||||
|
||||
|
||||
@@ -55,8 +55,8 @@ for key in orig_state_dict.keys():
|
||||
state_dict[key.replace("attn.out_proj.bias", "to_out.0.bias")] = weights
|
||||
else:
|
||||
state_dict[key] = orig_state_dict[key]
|
||||
deocder = WuerstchenDiffNeXt()
|
||||
deocder.load_state_dict(state_dict)
|
||||
decoder = WuerstchenDiffNeXt()
|
||||
decoder.load_state_dict(state_dict)
|
||||
|
||||
# Prior
|
||||
orig_state_dict = torch.load(os.path.join(model_path, "model_v3_stage_c.pt"), map_location=device)["ema_state_dict"]
|
||||
@@ -94,7 +94,7 @@ prior_pipeline = WuerstchenPriorPipeline(
|
||||
prior_pipeline.save_pretrained("warp-ai/wuerstchen-prior")
|
||||
|
||||
decoder_pipeline = WuerstchenDecoderPipeline(
|
||||
text_encoder=gen_text_encoder, tokenizer=gen_tokenizer, vqgan=vqmodel, decoder=deocder, scheduler=scheduler
|
||||
text_encoder=gen_text_encoder, tokenizer=gen_tokenizer, vqgan=vqmodel, decoder=decoder, scheduler=scheduler
|
||||
)
|
||||
decoder_pipeline.save_pretrained("warp-ai/wuerstchen")
|
||||
|
||||
@@ -103,7 +103,7 @@ wuerstchen_pipeline = WuerstchenCombinedPipeline(
|
||||
# Decoder
|
||||
text_encoder=gen_text_encoder,
|
||||
tokenizer=gen_tokenizer,
|
||||
decoder=deocder,
|
||||
decoder=decoder,
|
||||
scheduler=scheduler,
|
||||
vqgan=vqmodel,
|
||||
# Prior
|
||||
|
||||
@@ -116,7 +116,7 @@ _deps = [
|
||||
"librosa",
|
||||
"numpy",
|
||||
"parameterized",
|
||||
"peft>=0.6.0",
|
||||
"peft>=0.15.0",
|
||||
"protobuf>=3.20.3,<4",
|
||||
"pytest",
|
||||
"pytest-timeout",
|
||||
|
||||
@@ -23,7 +23,7 @@ deps = {
|
||||
"librosa": "librosa",
|
||||
"numpy": "numpy",
|
||||
"parameterized": "parameterized",
|
||||
"peft": "peft>=0.6.0",
|
||||
"peft": "peft>=0.15.0",
|
||||
"protobuf": "protobuf>=3.20.3,<4",
|
||||
"pytest": "pytest",
|
||||
"pytest-timeout": "pytest-timeout",
|
||||
|
||||
@@ -13,7 +13,7 @@
|
||||
# limitations under the License.
|
||||
|
||||
from contextlib import contextmanager, nullcontext
|
||||
from typing import Dict, List, Optional, Set, Tuple
|
||||
from typing import Dict, List, Optional, Set, Tuple, Union
|
||||
|
||||
import torch
|
||||
|
||||
@@ -55,7 +55,7 @@ class ModuleGroup:
|
||||
parameters: Optional[List[torch.nn.Parameter]] = None,
|
||||
buffers: Optional[List[torch.Tensor]] = None,
|
||||
non_blocking: bool = False,
|
||||
stream: Optional[torch.cuda.Stream] = None,
|
||||
stream: Union[torch.cuda.Stream, torch.Stream, None] = None,
|
||||
record_stream: Optional[bool] = False,
|
||||
low_cpu_mem_usage: bool = False,
|
||||
onload_self: bool = True,
|
||||
@@ -115,8 +115,13 @@ class ModuleGroup:
|
||||
|
||||
def onload_(self):
|
||||
r"""Onloads the group of modules to the onload_device."""
|
||||
context = nullcontext() if self.stream is None else torch.cuda.stream(self.stream)
|
||||
current_stream = torch.cuda.current_stream() if self.record_stream else None
|
||||
torch_accelerator_module = (
|
||||
getattr(torch, torch.accelerator.current_accelerator().type)
|
||||
if hasattr(torch, "accelerator")
|
||||
else torch.cuda
|
||||
)
|
||||
context = nullcontext() if self.stream is None else torch_accelerator_module.stream(self.stream)
|
||||
current_stream = torch_accelerator_module.current_stream() if self.record_stream else None
|
||||
|
||||
if self.stream is not None:
|
||||
# Wait for previous Host->Device transfer to complete
|
||||
@@ -162,9 +167,15 @@ class ModuleGroup:
|
||||
|
||||
def offload_(self):
|
||||
r"""Offloads the group of modules to the offload_device."""
|
||||
|
||||
torch_accelerator_module = (
|
||||
getattr(torch, torch.accelerator.current_accelerator().type)
|
||||
if hasattr(torch, "accelerator")
|
||||
else torch.cuda
|
||||
)
|
||||
if self.stream is not None:
|
||||
if not self.record_stream:
|
||||
torch.cuda.current_stream().synchronize()
|
||||
torch_accelerator_module.current_stream().synchronize()
|
||||
for group_module in self.modules:
|
||||
for param in group_module.parameters():
|
||||
param.data = self.cpu_param_dict[param]
|
||||
@@ -232,7 +243,7 @@ class GroupOffloadingHook(ModelHook):
|
||||
|
||||
class LazyPrefetchGroupOffloadingHook(ModelHook):
|
||||
r"""
|
||||
A hook, used in conjuction with GroupOffloadingHook, that applies lazy prefetching to groups of torch.nn.Module.
|
||||
A hook, used in conjunction with GroupOffloadingHook, that applies lazy prefetching to groups of torch.nn.Module.
|
||||
This hook is used to determine the order in which the layers are executed during the forward pass. Once the layer
|
||||
invocation order is known, assignments of the next_group attribute for prefetching can be made, which allows
|
||||
prefetching groups in the correct order.
|
||||
@@ -429,8 +440,10 @@ def apply_group_offloading(
|
||||
if use_stream:
|
||||
if torch.cuda.is_available():
|
||||
stream = torch.cuda.Stream()
|
||||
elif hasattr(torch, "xpu") and torch.xpu.is_available():
|
||||
stream = torch.Stream()
|
||||
else:
|
||||
raise ValueError("Using streams for data transfer requires a CUDA device.")
|
||||
raise ValueError("Using streams for data transfer requires a CUDA device, or an Intel XPU device.")
|
||||
|
||||
_raise_error_if_accelerate_model_or_sequential_hook_present(module)
|
||||
|
||||
@@ -468,7 +481,7 @@ def _apply_group_offloading_block_level(
|
||||
offload_device: torch.device,
|
||||
onload_device: torch.device,
|
||||
non_blocking: bool,
|
||||
stream: Optional[torch.cuda.Stream] = None,
|
||||
stream: Union[torch.cuda.Stream, torch.Stream, None] = None,
|
||||
record_stream: Optional[bool] = False,
|
||||
low_cpu_mem_usage: bool = False,
|
||||
) -> None:
|
||||
@@ -486,7 +499,7 @@ def _apply_group_offloading_block_level(
|
||||
non_blocking (`bool`):
|
||||
If True, offloading and onloading is done asynchronously. This can be useful for overlapping computation
|
||||
and data transfer.
|
||||
stream (`torch.cuda.Stream`, *optional*):
|
||||
stream (`torch.cuda.Stream`or `torch.Stream`, *optional*):
|
||||
If provided, offloading and onloading is done asynchronously using the provided stream. This can be useful
|
||||
for overlapping computation and data transfer.
|
||||
record_stream (`bool`, defaults to `False`): When enabled with `use_stream`, it marks the current tensor
|
||||
@@ -499,7 +512,10 @@ def _apply_group_offloading_block_level(
|
||||
the CPU memory is a bottleneck but may counteract the benefits of using streams.
|
||||
"""
|
||||
if stream is not None and num_blocks_per_group != 1:
|
||||
raise ValueError(f"Using streams is only supported for num_blocks_per_group=1. Got {num_blocks_per_group=}.")
|
||||
logger.warning(
|
||||
f"Using streams is only supported for num_blocks_per_group=1. Got {num_blocks_per_group=}. Setting it to 1."
|
||||
)
|
||||
num_blocks_per_group = 1
|
||||
|
||||
# Create module groups for ModuleList and Sequential blocks
|
||||
modules_with_group_offloading = set()
|
||||
@@ -569,7 +585,7 @@ def _apply_group_offloading_leaf_level(
|
||||
offload_device: torch.device,
|
||||
onload_device: torch.device,
|
||||
non_blocking: bool,
|
||||
stream: Optional[torch.cuda.Stream] = None,
|
||||
stream: Union[torch.cuda.Stream, torch.Stream, None] = None,
|
||||
record_stream: Optional[bool] = False,
|
||||
low_cpu_mem_usage: bool = False,
|
||||
) -> None:
|
||||
@@ -589,7 +605,7 @@ def _apply_group_offloading_leaf_level(
|
||||
non_blocking (`bool`):
|
||||
If True, offloading and onloading is done asynchronously. This can be useful for overlapping computation
|
||||
and data transfer.
|
||||
stream (`torch.cuda.Stream`, *optional*):
|
||||
stream (`torch.cuda.Stream` or `torch.Stream`, *optional*):
|
||||
If provided, offloading and onloading is done asynchronously using the provided stream. This can be useful
|
||||
for overlapping computation and data transfer.
|
||||
record_stream (`bool`, defaults to `False`): When enabled with `use_stream`, it marks the current tensor
|
||||
|
||||
@@ -90,7 +90,7 @@ class PeftInputAutocastDisableHook(ModelHook):
|
||||
that the inputs are casted to the computation dtype correctly always. However, there are two goals we are
|
||||
hoping to achieve:
|
||||
1. Making forward implementations independent of device/dtype casting operations as much as possible.
|
||||
2. Peforming inference without losing information from casting to different precisions. With the current
|
||||
2. Performing inference without losing information from casting to different precisions. With the current
|
||||
PEFT implementation (as linked in the reference above), and assuming running layerwise casting inference
|
||||
with storage_dtype=torch.float8_e4m3fn and compute_dtype=torch.bfloat16, inputs are cast to
|
||||
torch.float8_e4m3fn in the lora layer. We will then upcast back to torch.bfloat16 when we continue the
|
||||
|
||||
@@ -819,7 +819,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
|
||||
if zero_status_pe:
|
||||
logger.info(
|
||||
"The `position_embedding` LoRA params are all zeros which make them ineffective. "
|
||||
"So, we will purge them out of the curret state dict to make loading possible."
|
||||
"So, we will purge them out of the current state dict to make loading possible."
|
||||
)
|
||||
|
||||
else:
|
||||
@@ -835,7 +835,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
|
||||
if zero_status_t5:
|
||||
logger.info(
|
||||
"The `t5xxl` LoRA params are all zeros which make them ineffective. "
|
||||
"So, we will purge them out of the curret state dict to make loading possible."
|
||||
"So, we will purge them out of the current state dict to make loading possible."
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
@@ -850,7 +850,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
|
||||
if zero_status_diff_b:
|
||||
logger.info(
|
||||
"The `diff_b` LoRA params are all zeros which make them ineffective. "
|
||||
"So, we will purge them out of the curret state dict to make loading possible."
|
||||
"So, we will purge them out of the current state dict to make loading possible."
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
@@ -866,7 +866,7 @@ def _convert_kohya_flux_lora_to_diffusers(state_dict):
|
||||
if zero_status_diff:
|
||||
logger.info(
|
||||
"The `diff` LoRA params are all zeros which make them ineffective. "
|
||||
"So, we will purge them out of the curret state dict to make loading possible."
|
||||
"So, we will purge them out of the current state dict to make loading possible."
|
||||
)
|
||||
else:
|
||||
logger.info(
|
||||
@@ -1237,7 +1237,7 @@ def _convert_bfl_flux_control_lora_to_diffusers(original_state_dict):
|
||||
f"double_blocks.{i}.txt_attn.norm.key_norm.scale"
|
||||
)
|
||||
|
||||
# single transfomer blocks
|
||||
# single transformer blocks
|
||||
for i in range(num_single_layers):
|
||||
block_prefix = f"single_transformer_blocks.{i}."
|
||||
|
||||
|
||||
@@ -2413,7 +2413,7 @@ class FluxLoraLoaderMixin(LoraBaseMixin):
|
||||
) -> bool:
|
||||
"""
|
||||
Control LoRA expands the shape of the input layer from (3072, 64) to (3072, 128). This method handles that and
|
||||
generalizes things a bit so that any parameter that needs expansion receives appropriate treatement.
|
||||
generalizes things a bit so that any parameter that needs expansion receives appropriate treatment.
|
||||
"""
|
||||
state_dict = {}
|
||||
if lora_state_dict is not None:
|
||||
|
||||
@@ -330,7 +330,7 @@ class PeftAdapterMixin:
|
||||
new_sd[k] = v
|
||||
return new_sd
|
||||
|
||||
# To handle scenarios where we cannot successfully set state dict. If it's unsucessful,
|
||||
# To handle scenarios where we cannot successfully set state dict. If it's unsuccessful,
|
||||
# we should also delete the `peft_config` associated to the `adapter_name`.
|
||||
try:
|
||||
if hotswap:
|
||||
@@ -344,7 +344,7 @@ class PeftAdapterMixin:
|
||||
config=lora_config,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Hotswapping {adapter_name} was unsucessful with the following error: \n{e}")
|
||||
logger.error(f"Hotswapping {adapter_name} was unsuccessful with the following error: \n{e}")
|
||||
raise
|
||||
# the hotswap function raises if there are incompatible keys, so if we reach this point we can set
|
||||
# it to None
|
||||
@@ -379,7 +379,7 @@ class PeftAdapterMixin:
|
||||
module.delete_adapter(adapter_name)
|
||||
|
||||
self.peft_config.pop(adapter_name)
|
||||
logger.error(f"Loading {adapter_name} was unsucessful with the following error: \n{e}")
|
||||
logger.error(f"Loading {adapter_name} was unsuccessful with the following error: \n{e}")
|
||||
raise
|
||||
|
||||
warn_msg = ""
|
||||
@@ -712,7 +712,7 @@ class PeftAdapterMixin:
|
||||
if self.lora_scale != 1.0:
|
||||
module.scale_layer(self.lora_scale)
|
||||
|
||||
# For BC with prevous PEFT versions, we need to check the signature
|
||||
# For BC with previous PEFT versions, we need to check the signature
|
||||
# of the `merge` method to see if it supports the `adapter_names` argument.
|
||||
supported_merge_kwargs = list(inspect.signature(module.merge).parameters)
|
||||
if "adapter_names" in supported_merge_kwargs:
|
||||
|
||||
@@ -453,7 +453,7 @@ class FromSingleFileMixin:
|
||||
logger.warning(
|
||||
"Detected legacy `from_single_file` loading behavior. Attempting to create the pipeline based on inferred components.\n"
|
||||
"This may lead to errors if the model components are not correctly inferred. \n"
|
||||
"To avoid this warning, please explicity pass the `config` argument to `from_single_file` with a path to a local diffusers model repo \n"
|
||||
"To avoid this warning, please explicitly pass the `config` argument to `from_single_file` with a path to a local diffusers model repo \n"
|
||||
"e.g. `from_single_file(<my model checkpoint path>, config=<path to local diffusers model repo>) \n"
|
||||
"or run `from_single_file` with `local_files_only=False` first to update the local cache directory with "
|
||||
"the necessary config files.\n"
|
||||
|
||||
@@ -2278,7 +2278,7 @@ def convert_flux_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
|
||||
f"double_blocks.{i}.txt_attn.proj.bias"
|
||||
)
|
||||
|
||||
# single transfomer blocks
|
||||
# single transformer blocks
|
||||
for i in range(num_single_layers):
|
||||
block_prefix = f"single_transformer_blocks.{i}."
|
||||
# norm.linear <- single_blocks.0.modulation.lin
|
||||
@@ -2872,7 +2872,7 @@ def convert_auraflow_transformer_checkpoint_to_diffusers(checkpoint, **kwargs):
|
||||
def convert_lumina2_to_diffusers(checkpoint, **kwargs):
|
||||
converted_state_dict = {}
|
||||
|
||||
# Original Lumina-Image-2 has an extra norm paramter that is unused
|
||||
# Original Lumina-Image-2 has an extra norm parameter that is unused
|
||||
# We just remove it here
|
||||
checkpoint.pop("norm_final.weight", None)
|
||||
|
||||
|
||||
@@ -123,7 +123,7 @@ class SD3Transformer2DLoadersMixin:
|
||||
key = key.replace(f"layers.{idx}.2.1", f"layers.{idx}.adaln_proj")
|
||||
updated_state_dict[key] = value
|
||||
|
||||
# Image projetion parameters
|
||||
# Image projection parameters
|
||||
embed_dim = updated_state_dict["proj_in.weight"].shape[1]
|
||||
output_dim = updated_state_dict["proj_out.weight"].shape[0]
|
||||
hidden_dim = updated_state_dict["proj_in.weight"].shape[0]
|
||||
|
||||
@@ -730,6 +730,76 @@ class AutoencoderKLWan(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
base_dim, z_dim, dim_mult, num_res_blocks, attn_scales, self.temperal_upsample, dropout
|
||||
)
|
||||
|
||||
self.spatial_compression_ratio = 2 ** len(self.temperal_downsample)
|
||||
|
||||
# When decoding a batch of video latents at a time, one can save memory by slicing across the batch dimension
|
||||
# to perform decoding of a single video latent at a time.
|
||||
self.use_slicing = False
|
||||
|
||||
# When decoding spatially large video latents, the memory requirement is very high. By breaking the video latent
|
||||
# frames spatially into smaller tiles and performing multiple forward passes for decoding, and then blending the
|
||||
# intermediate tiles together, the memory requirement can be lowered.
|
||||
self.use_tiling = False
|
||||
|
||||
# The minimal tile height and width for spatial tiling to be used
|
||||
self.tile_sample_min_height = 256
|
||||
self.tile_sample_min_width = 256
|
||||
|
||||
# The minimal distance between two spatial tiles
|
||||
self.tile_sample_stride_height = 192
|
||||
self.tile_sample_stride_width = 192
|
||||
|
||||
def enable_tiling(
|
||||
self,
|
||||
tile_sample_min_height: Optional[int] = None,
|
||||
tile_sample_min_width: Optional[int] = None,
|
||||
tile_sample_stride_height: Optional[float] = None,
|
||||
tile_sample_stride_width: Optional[float] = None,
|
||||
) -> None:
|
||||
r"""
|
||||
Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
|
||||
compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
|
||||
processing larger images.
|
||||
|
||||
Args:
|
||||
tile_sample_min_height (`int`, *optional*):
|
||||
The minimum height required for a sample to be separated into tiles across the height dimension.
|
||||
tile_sample_min_width (`int`, *optional*):
|
||||
The minimum width required for a sample to be separated into tiles across the width dimension.
|
||||
tile_sample_stride_height (`int`, *optional*):
|
||||
The minimum amount of overlap between two consecutive vertical tiles. This is to ensure that there are
|
||||
no tiling artifacts produced across the height dimension.
|
||||
tile_sample_stride_width (`int`, *optional*):
|
||||
The stride between two consecutive horizontal tiles. This is to ensure that there are no tiling
|
||||
artifacts produced across the width dimension.
|
||||
"""
|
||||
self.use_tiling = True
|
||||
self.tile_sample_min_height = tile_sample_min_height or self.tile_sample_min_height
|
||||
self.tile_sample_min_width = tile_sample_min_width or self.tile_sample_min_width
|
||||
self.tile_sample_stride_height = tile_sample_stride_height or self.tile_sample_stride_height
|
||||
self.tile_sample_stride_width = tile_sample_stride_width or self.tile_sample_stride_width
|
||||
|
||||
def disable_tiling(self) -> None:
|
||||
r"""
|
||||
Disable tiled VAE decoding. If `enable_tiling` was previously enabled, this method will go back to computing
|
||||
decoding in one step.
|
||||
"""
|
||||
self.use_tiling = False
|
||||
|
||||
def enable_slicing(self) -> None:
|
||||
r"""
|
||||
Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
|
||||
compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
|
||||
"""
|
||||
self.use_slicing = True
|
||||
|
||||
def disable_slicing(self) -> None:
|
||||
r"""
|
||||
Disable sliced VAE decoding. If `enable_slicing` was previously enabled, this method will go back to computing
|
||||
decoding in one step.
|
||||
"""
|
||||
self.use_slicing = False
|
||||
|
||||
def clear_cache(self):
|
||||
def _count_conv3d(model):
|
||||
count = 0
|
||||
@@ -746,11 +816,14 @@ class AutoencoderKLWan(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
self._enc_conv_idx = [0]
|
||||
self._enc_feat_map = [None] * self._enc_conv_num
|
||||
|
||||
def _encode(self, x: torch.Tensor) -> torch.Tensor:
|
||||
def _encode(self, x: torch.Tensor):
|
||||
_, _, num_frame, height, width = x.shape
|
||||
|
||||
if self.use_tiling and (width > self.tile_sample_min_width or height > self.tile_sample_min_height):
|
||||
return self.tiled_encode(x)
|
||||
|
||||
self.clear_cache()
|
||||
## cache
|
||||
t = x.shape[2]
|
||||
iter_ = 1 + (t - 1) // 4
|
||||
iter_ = 1 + (num_frame - 1) // 4
|
||||
for i in range(iter_):
|
||||
self._enc_conv_idx = [0]
|
||||
if i == 0:
|
||||
@@ -764,8 +837,6 @@ class AutoencoderKLWan(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
out = torch.cat([out, out_], 2)
|
||||
|
||||
enc = self.quant_conv(out)
|
||||
mu, logvar = enc[:, : self.z_dim, :, :, :], enc[:, self.z_dim :, :, :, :]
|
||||
enc = torch.cat([mu, logvar], dim=1)
|
||||
self.clear_cache()
|
||||
return enc
|
||||
|
||||
@@ -785,18 +856,28 @@ class AutoencoderKLWan(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
The latent representations of the encoded videos. If `return_dict` is True, a
|
||||
[`~models.autoencoder_kl.AutoencoderKLOutput`] is returned, otherwise a plain `tuple` is returned.
|
||||
"""
|
||||
h = self._encode(x)
|
||||
if self.use_slicing and x.shape[0] > 1:
|
||||
encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
|
||||
h = torch.cat(encoded_slices)
|
||||
else:
|
||||
h = self._encode(x)
|
||||
posterior = DiagonalGaussianDistribution(h)
|
||||
|
||||
if not return_dict:
|
||||
return (posterior,)
|
||||
return AutoencoderKLOutput(latent_dist=posterior)
|
||||
|
||||
def _decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
|
||||
self.clear_cache()
|
||||
def _decode(self, z: torch.Tensor, return_dict: bool = True):
|
||||
_, _, num_frame, height, width = z.shape
|
||||
tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
|
||||
tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
|
||||
|
||||
iter_ = z.shape[2]
|
||||
if self.use_tiling and (width > tile_latent_min_width or height > tile_latent_min_height):
|
||||
return self.tiled_decode(z, return_dict=return_dict)
|
||||
|
||||
self.clear_cache()
|
||||
x = self.post_quant_conv(z)
|
||||
for i in range(iter_):
|
||||
for i in range(num_frame):
|
||||
self._conv_idx = [0]
|
||||
if i == 0:
|
||||
out = self.decoder(x[:, :, i : i + 1, :, :], feat_cache=self._feat_map, feat_idx=self._conv_idx)
|
||||
@@ -826,12 +907,161 @@ class AutoencoderKLWan(ModelMixin, ConfigMixin, FromOriginalModelMixin):
|
||||
If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
|
||||
returned.
|
||||
"""
|
||||
decoded = self._decode(z).sample
|
||||
if self.use_slicing and z.shape[0] > 1:
|
||||
decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)]
|
||||
decoded = torch.cat(decoded_slices)
|
||||
else:
|
||||
decoded = self._decode(z).sample
|
||||
|
||||
if not return_dict:
|
||||
return (decoded,)
|
||||
|
||||
return DecoderOutput(sample=decoded)
|
||||
|
||||
def blend_v(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
|
||||
blend_extent = min(a.shape[-2], b.shape[-2], blend_extent)
|
||||
for y in range(blend_extent):
|
||||
b[:, :, :, y, :] = a[:, :, :, -blend_extent + y, :] * (1 - y / blend_extent) + b[:, :, :, y, :] * (
|
||||
y / blend_extent
|
||||
)
|
||||
return b
|
||||
|
||||
def blend_h(self, a: torch.Tensor, b: torch.Tensor, blend_extent: int) -> torch.Tensor:
|
||||
blend_extent = min(a.shape[-1], b.shape[-1], blend_extent)
|
||||
for x in range(blend_extent):
|
||||
b[:, :, :, :, x] = a[:, :, :, :, -blend_extent + x] * (1 - x / blend_extent) + b[:, :, :, :, x] * (
|
||||
x / blend_extent
|
||||
)
|
||||
return b
|
||||
|
||||
def tiled_encode(self, x: torch.Tensor) -> AutoencoderKLOutput:
|
||||
r"""Encode a batch of images using a tiled encoder.
|
||||
|
||||
Args:
|
||||
x (`torch.Tensor`): Input batch of videos.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The latent representation of the encoded videos.
|
||||
"""
|
||||
_, _, num_frames, height, width = x.shape
|
||||
latent_height = height // self.spatial_compression_ratio
|
||||
latent_width = width // self.spatial_compression_ratio
|
||||
|
||||
tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
|
||||
tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
|
||||
tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
|
||||
tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
|
||||
|
||||
blend_height = tile_latent_min_height - tile_latent_stride_height
|
||||
blend_width = tile_latent_min_width - tile_latent_stride_width
|
||||
|
||||
# Split x into overlapping tiles and encode them separately.
|
||||
# The tiles have an overlap to avoid seams between tiles.
|
||||
rows = []
|
||||
for i in range(0, height, self.tile_sample_stride_height):
|
||||
row = []
|
||||
for j in range(0, width, self.tile_sample_stride_width):
|
||||
self.clear_cache()
|
||||
time = []
|
||||
frame_range = 1 + (num_frames - 1) // 4
|
||||
for k in range(frame_range):
|
||||
self._enc_conv_idx = [0]
|
||||
if k == 0:
|
||||
tile = x[:, :, :1, i : i + self.tile_sample_min_height, j : j + self.tile_sample_min_width]
|
||||
else:
|
||||
tile = x[
|
||||
:,
|
||||
:,
|
||||
1 + 4 * (k - 1) : 1 + 4 * k,
|
||||
i : i + self.tile_sample_min_height,
|
||||
j : j + self.tile_sample_min_width,
|
||||
]
|
||||
tile = self.encoder(tile, feat_cache=self._enc_feat_map, feat_idx=self._enc_conv_idx)
|
||||
tile = self.quant_conv(tile)
|
||||
time.append(tile)
|
||||
row.append(torch.cat(time, dim=2))
|
||||
rows.append(row)
|
||||
self.clear_cache()
|
||||
|
||||
result_rows = []
|
||||
for i, row in enumerate(rows):
|
||||
result_row = []
|
||||
for j, tile in enumerate(row):
|
||||
# blend the above tile and the left tile
|
||||
# to the current tile and add the current tile to the result row
|
||||
if i > 0:
|
||||
tile = self.blend_v(rows[i - 1][j], tile, blend_height)
|
||||
if j > 0:
|
||||
tile = self.blend_h(row[j - 1], tile, blend_width)
|
||||
result_row.append(tile[:, :, :, :tile_latent_stride_height, :tile_latent_stride_width])
|
||||
result_rows.append(torch.cat(result_row, dim=-1))
|
||||
|
||||
enc = torch.cat(result_rows, dim=3)[:, :, :, :latent_height, :latent_width]
|
||||
return enc
|
||||
|
||||
def tiled_decode(self, z: torch.Tensor, return_dict: bool = True) -> Union[DecoderOutput, torch.Tensor]:
|
||||
r"""
|
||||
Decode a batch of images using a tiled decoder.
|
||||
|
||||
Args:
|
||||
z (`torch.Tensor`): Input batch of latent vectors.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~models.vae.DecoderOutput`] instead of a plain tuple.
|
||||
|
||||
Returns:
|
||||
[`~models.vae.DecoderOutput`] or `tuple`:
|
||||
If return_dict is True, a [`~models.vae.DecoderOutput`] is returned, otherwise a plain `tuple` is
|
||||
returned.
|
||||
"""
|
||||
_, _, num_frames, height, width = z.shape
|
||||
sample_height = height * self.spatial_compression_ratio
|
||||
sample_width = width * self.spatial_compression_ratio
|
||||
|
||||
tile_latent_min_height = self.tile_sample_min_height // self.spatial_compression_ratio
|
||||
tile_latent_min_width = self.tile_sample_min_width // self.spatial_compression_ratio
|
||||
tile_latent_stride_height = self.tile_sample_stride_height // self.spatial_compression_ratio
|
||||
tile_latent_stride_width = self.tile_sample_stride_width // self.spatial_compression_ratio
|
||||
|
||||
blend_height = self.tile_sample_min_height - self.tile_sample_stride_height
|
||||
blend_width = self.tile_sample_min_width - self.tile_sample_stride_width
|
||||
|
||||
# Split z into overlapping tiles and decode them separately.
|
||||
# The tiles have an overlap to avoid seams between tiles.
|
||||
rows = []
|
||||
for i in range(0, height, tile_latent_stride_height):
|
||||
row = []
|
||||
for j in range(0, width, tile_latent_stride_width):
|
||||
self.clear_cache()
|
||||
time = []
|
||||
for k in range(num_frames):
|
||||
self._conv_idx = [0]
|
||||
tile = z[:, :, k : k + 1, i : i + tile_latent_min_height, j : j + tile_latent_min_width]
|
||||
tile = self.post_quant_conv(tile)
|
||||
decoded = self.decoder(tile, feat_cache=self._feat_map, feat_idx=self._conv_idx)
|
||||
time.append(decoded)
|
||||
row.append(torch.cat(time, dim=2))
|
||||
rows.append(row)
|
||||
self.clear_cache()
|
||||
|
||||
result_rows = []
|
||||
for i, row in enumerate(rows):
|
||||
result_row = []
|
||||
for j, tile in enumerate(row):
|
||||
# blend the above tile and the left tile
|
||||
# to the current tile and add the current tile to the result row
|
||||
if i > 0:
|
||||
tile = self.blend_v(rows[i - 1][j], tile, blend_height)
|
||||
if j > 0:
|
||||
tile = self.blend_h(row[j - 1], tile, blend_width)
|
||||
result_row.append(tile[:, :, :, : self.tile_sample_stride_height, : self.tile_sample_stride_width])
|
||||
result_rows.append(torch.cat(result_row, dim=-1))
|
||||
|
||||
dec = torch.cat(result_rows, dim=3)[:, :, :, :sample_height, :sample_width]
|
||||
|
||||
if not return_dict:
|
||||
return (dec,)
|
||||
return DecoderOutput(sample=dec)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
sample: torch.Tensor,
|
||||
|
||||
@@ -734,17 +734,17 @@ class UNetControlNetXSModel(ModelMixin, ConfigMixin):
|
||||
unet (`UNet2DConditionModel`):
|
||||
The UNet model we want to control.
|
||||
controlnet (`ControlNetXSAdapter`):
|
||||
The ConntrolNet-XS adapter with which the UNet will be fused. If none is given, a new ConntrolNet-XS
|
||||
The ControlNet-XS adapter with which the UNet will be fused. If none is given, a new ControlNet-XS
|
||||
adapter will be created.
|
||||
size_ratio (float, *optional*, defaults to `None`):
|
||||
Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
|
||||
Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
|
||||
ctrl_block_out_channels (`List[int]`, *optional*, defaults to `None`):
|
||||
Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details,
|
||||
Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details,
|
||||
where this parameter is called `block_out_channels`.
|
||||
time_embedding_mix (`float`, *optional*, defaults to None):
|
||||
Used to contruct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
|
||||
Used to construct the controlnet if none is given. See [`ControlNetXSAdapter.from_unet`] for details.
|
||||
ctrl_optional_kwargs (`Dict`, *optional*, defaults to `None`):
|
||||
Passed to the `init` of the new controlent if no controlent was given.
|
||||
Passed to the `init` of the new controlnet if no controlnet was given.
|
||||
"""
|
||||
if controlnet is None:
|
||||
controlnet = ControlNetXSAdapter.from_unet(
|
||||
|
||||
@@ -97,7 +97,7 @@ def get_3d_sincos_pos_embed(
|
||||
The spatial dimension of positional embeddings. If an integer is provided, the same size is applied to both
|
||||
spatial dimensions (height and width).
|
||||
temporal_size (`int`):
|
||||
The temporal dimension of postional embeddings (number of frames).
|
||||
The temporal dimension of positional embeddings (number of frames).
|
||||
spatial_interpolation_scale (`float`, defaults to 1.0):
|
||||
Scale factor for spatial grid interpolation.
|
||||
temporal_interpolation_scale (`float`, defaults to 1.0):
|
||||
@@ -169,7 +169,7 @@ def _get_3d_sincos_pos_embed_np(
|
||||
The spatial dimension of positional embeddings. If an integer is provided, the same size is applied to both
|
||||
spatial dimensions (height and width).
|
||||
temporal_size (`int`):
|
||||
The temporal dimension of postional embeddings (number of frames).
|
||||
The temporal dimension of positional embeddings (number of frames).
|
||||
spatial_interpolation_scale (`float`, defaults to 1.0):
|
||||
Scale factor for spatial grid interpolation.
|
||||
temporal_interpolation_scale (`float`, defaults to 1.0):
|
||||
|
||||
@@ -30,7 +30,7 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin, CacheMixin):
|
||||
_supports_gradient_checkpointing = True
|
||||
|
||||
"""
|
||||
A 3D Transformer model for video-like data, paper: https://arxiv.org/abs/2401.03048, offical code:
|
||||
A 3D Transformer model for video-like data, paper: https://arxiv.org/abs/2401.03048, official code:
|
||||
https://github.com/Vchitect/Latte
|
||||
|
||||
Parameters:
|
||||
@@ -216,7 +216,7 @@ class LatteTransformer3DModel(ModelMixin, ConfigMixin, CacheMixin):
|
||||
)
|
||||
num_patches = height * width
|
||||
|
||||
hidden_states = self.pos_embed(hidden_states) # alrady add positional embeddings
|
||||
hidden_states = self.pos_embed(hidden_states) # already add positional embeddings
|
||||
|
||||
added_cond_kwargs = {"resolution": None, "aspect_ratio": None}
|
||||
timestep, embedded_timestep = self.adaln_single(
|
||||
|
||||
@@ -43,7 +43,7 @@ class LuminaNextDiTBlock(nn.Module):
|
||||
num_kv_heads (`int`):
|
||||
Number of attention heads in key and value features (if using GQA), or set to None for the same as query.
|
||||
multiple_of (`int`): The number of multiple of ffn layer.
|
||||
ffn_dim_multiplier (`float`): The multipier factor of ffn layer dimension.
|
||||
ffn_dim_multiplier (`float`): The multiplier factor of ffn layer dimension.
|
||||
norm_eps (`float`): The eps for norm layer.
|
||||
qk_norm (`bool`): normalization for query and key.
|
||||
cross_attention_dim (`int`): Cross attention embedding dimension of the input text prompt hidden_states.
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user