Compare commits
32 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 215af1a602 | |||
| 1a917d3ac5 | |||
| 65efbcead5 | |||
| e58711e73c | |||
| cbecc33570 | |||
| 5237a82a35 | |||
| 513dbdb2f3 | |||
| 865ba102b3 | |||
| 552c127c05 | |||
| 4b7fe044e3 | |||
| 532f41c999 | |||
| 5fcd5f560f | |||
| 0fd7ee79ea | |||
| 2a52a25b9a | |||
| 0137a16ed5 | |||
| ce12925a23 | |||
| 80b06b0d5f | |||
| 42c19fdd0d | |||
| 0d1c5b0c3e | |||
| 0e46c55931 | |||
| 8f8888a76e | |||
| afc9721898 | |||
| 2c4ee10b77 | |||
| cf1ca728ea | |||
| 144e6e2540 | |||
| 22b229ba66 | |||
| a840c39ad8 | |||
| 9a7ae77a4e | |||
| 673d4357ff | |||
| 561ab54de3 | |||
| b60faf456b | |||
| 3e73dc24a4 |
@@ -1,38 +0,0 @@
|
||||
name: Run Flax dependency tests
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "src/diffusers/**.py"
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
|
||||
concurrency:
|
||||
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
|
||||
cancel-in-progress: true
|
||||
|
||||
jobs:
|
||||
check_flax_dependencies:
|
||||
runs-on: ubuntu-22.04
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v4
|
||||
with:
|
||||
python-version: "3.8"
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
python -m pip install --upgrade pip uv
|
||||
python -m uv pip install -e .
|
||||
python -m uv pip install "jax[cpu]>=0.2.16,!=0.3.2"
|
||||
python -m uv pip install "flax>=0.4.1"
|
||||
python -m uv pip install "jaxlib>=0.1.65"
|
||||
python -m uv pip install pytest
|
||||
- name: Check for soft dependencies
|
||||
run: |
|
||||
python -m venv /opt/venv && export PATH="/opt/venv/bin:$PATH"
|
||||
pytest tests/others/test_dependencies.py
|
||||
@@ -37,7 +37,7 @@ limitations under the License.
|
||||
|
||||
## Installation
|
||||
|
||||
We recommend installing 🤗 Diffusers in a virtual environment from PyPI or Conda. For more details about installing [PyTorch](https://pytorch.org/get-started/locally/) and [Flax](https://flax.readthedocs.io/en/latest/#installation), please refer to their official documentation.
|
||||
We recommend installing 🤗 Diffusers in a virtual environment from PyPI or Conda. For more details about installing [PyTorch](https://pytorch.org/get-started/locally/), please refer to their official documentation.
|
||||
|
||||
### PyTorch
|
||||
|
||||
@@ -53,14 +53,6 @@ With `conda` (maintained by the community):
|
||||
conda install -c conda-forge diffusers
|
||||
```
|
||||
|
||||
### Flax
|
||||
|
||||
With `pip` (official package):
|
||||
|
||||
```bash
|
||||
pip install --upgrade diffusers[flax]
|
||||
```
|
||||
|
||||
### Apple Silicon (M1/M2) support
|
||||
|
||||
Please refer to the [How to use Stable Diffusion in Apple Silicon](https://huggingface.co/docs/diffusers/optimization/mps) guide.
|
||||
|
||||
@@ -1,49 +0,0 @@
|
||||
FROM ubuntu:20.04
|
||||
LABEL maintainer="Hugging Face"
|
||||
LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt-get -y update \
|
||||
&& apt-get install -y software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa
|
||||
|
||||
RUN apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
libsndfile1-dev \
|
||||
libgl1 \
|
||||
python3.10 \
|
||||
python3-pip \
|
||||
python3.10-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3.10 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
# follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3 -m uv pip install --upgrade --no-cache-dir \
|
||||
clu \
|
||||
"jax[cpu]>=0.2.16,!=0.3.2" \
|
||||
"flax>=0.4.1" \
|
||||
"jaxlib>=0.1.65" && \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
numpy==1.26.4 \
|
||||
scipy \
|
||||
tensorboard \
|
||||
transformers \
|
||||
hf_transfer
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -1,51 +0,0 @@
|
||||
FROM ubuntu:20.04
|
||||
LABEL maintainer="Hugging Face"
|
||||
LABEL repository="diffusers"
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt-get -y update \
|
||||
&& apt-get install -y software-properties-common \
|
||||
&& add-apt-repository ppa:deadsnakes/ppa
|
||||
|
||||
RUN apt install -y bash \
|
||||
build-essential \
|
||||
git \
|
||||
git-lfs \
|
||||
curl \
|
||||
ca-certificates \
|
||||
libsndfile1-dev \
|
||||
libgl1 \
|
||||
python3.10 \
|
||||
python3-pip \
|
||||
python3.10-venv && \
|
||||
rm -rf /var/lib/apt/lists
|
||||
|
||||
# make sure to use venv
|
||||
RUN python3.10 -m venv /opt/venv
|
||||
ENV PATH="/opt/venv/bin:$PATH"
|
||||
|
||||
# pre-install the heavy dependencies (these can later be overridden by the deps from setup.py)
|
||||
# follow the instructions here: https://cloud.google.com/tpu/docs/run-in-container#train_a_jax_model_in_a_docker_container
|
||||
RUN python3 -m pip install --no-cache-dir --upgrade pip uv==0.1.11 && \
|
||||
python3 -m pip install --no-cache-dir \
|
||||
"jax[tpu]>=0.2.16,!=0.3.2" \
|
||||
-f https://storage.googleapis.com/jax-releases/libtpu_releases.html && \
|
||||
python3 -m uv pip install --upgrade --no-cache-dir \
|
||||
clu \
|
||||
"flax>=0.4.1" \
|
||||
"jaxlib>=0.1.65" && \
|
||||
python3 -m uv pip install --no-cache-dir \
|
||||
accelerate \
|
||||
datasets \
|
||||
hf-doc-builder \
|
||||
huggingface-hub \
|
||||
Jinja2 \
|
||||
librosa \
|
||||
numpy==1.26.4 \
|
||||
scipy \
|
||||
tensorboard \
|
||||
transformers \
|
||||
hf_transfer
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -9,19 +9,19 @@
|
||||
- local: stable_diffusion
|
||||
title: Basic performance
|
||||
|
||||
- title: DiffusionPipeline
|
||||
- title: Pipelines
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: using-diffusers/loading
|
||||
title: Load pipelines
|
||||
title: DiffusionPipeline
|
||||
- local: tutorials/autopipeline
|
||||
title: AutoPipeline
|
||||
- local: using-diffusers/custom_pipeline_overview
|
||||
title: Load community pipelines and components
|
||||
title: Community pipelines and components
|
||||
- local: using-diffusers/callback
|
||||
title: Pipeline callbacks
|
||||
- local: using-diffusers/reusing_seeds
|
||||
title: Reproducible pipelines
|
||||
title: Reproducibility
|
||||
- local: using-diffusers/schedulers
|
||||
title: Load schedulers and models
|
||||
- local: using-diffusers/scheduler_features
|
||||
@@ -62,8 +62,6 @@
|
||||
title: Scheduler features
|
||||
- local: using-diffusers/callback
|
||||
title: Pipeline callbacks
|
||||
- local: using-diffusers/reusing_seeds
|
||||
title: Reproducible pipelines
|
||||
- local: using-diffusers/image_quality
|
||||
title: Controlling image quality
|
||||
|
||||
@@ -77,7 +75,7 @@
|
||||
- local: optimization/memory
|
||||
title: Reduce memory usage
|
||||
- local: optimization/speed-memory-optims
|
||||
title: Compile and offloading quantized models
|
||||
title: Compiling and offloading quantized models
|
||||
- title: Community optimizations
|
||||
sections:
|
||||
- local: optimization/pruna
|
||||
@@ -194,8 +192,6 @@
|
||||
- title: Model accelerators and hardware
|
||||
isExpanded: false
|
||||
sections:
|
||||
- local: using-diffusers/stable_diffusion_jax_how_to
|
||||
title: JAX/Flax
|
||||
- local: optimization/onnx
|
||||
title: ONNX
|
||||
- local: optimization/open_vino
|
||||
|
||||
@@ -44,15 +44,3 @@ model = AutoencoderKL.from_single_file(url)
|
||||
## DecoderOutput
|
||||
|
||||
[[autodoc]] models.autoencoders.vae.DecoderOutput
|
||||
|
||||
## FlaxAutoencoderKL
|
||||
|
||||
[[autodoc]] FlaxAutoencoderKL
|
||||
|
||||
## FlaxAutoencoderKLOutput
|
||||
|
||||
[[autodoc]] models.vae_flax.FlaxAutoencoderKLOutput
|
||||
|
||||
## FlaxDecoderOutput
|
||||
|
||||
[[autodoc]] models.vae_flax.FlaxDecoderOutput
|
||||
|
||||
@@ -40,11 +40,3 @@ pipe = StableDiffusionControlNetPipeline.from_single_file(url, controlnet=contro
|
||||
## ControlNetOutput
|
||||
|
||||
[[autodoc]] models.controlnets.controlnet.ControlNetOutput
|
||||
|
||||
## FlaxControlNetModel
|
||||
|
||||
[[autodoc]] FlaxControlNetModel
|
||||
|
||||
## FlaxControlNetOutput
|
||||
|
||||
[[autodoc]] models.controlnets.controlnet_flax.FlaxControlNetOutput
|
||||
|
||||
@@ -19,10 +19,6 @@ All models are built from the base [`ModelMixin`] class which is a [`torch.nn.Mo
|
||||
## ModelMixin
|
||||
[[autodoc]] ModelMixin
|
||||
|
||||
## FlaxModelMixin
|
||||
|
||||
[[autodoc]] FlaxModelMixin
|
||||
|
||||
## PushToHubMixin
|
||||
|
||||
[[autodoc]] utils.PushToHubMixin
|
||||
|
||||
@@ -23,9 +23,3 @@ The abstract from the paper is:
|
||||
|
||||
## UNet2DConditionOutput
|
||||
[[autodoc]] models.unets.unet_2d_condition.UNet2DConditionOutput
|
||||
|
||||
## FlaxUNet2DConditionModel
|
||||
[[autodoc]] models.unets.unet_2d_condition_flax.FlaxUNet2DConditionModel
|
||||
|
||||
## FlaxUNet2DConditionOutput
|
||||
[[autodoc]] models.unets.unet_2d_condition_flax.FlaxUNet2DConditionOutput
|
||||
|
||||
@@ -54,10 +54,6 @@ To check a specific pipeline or model output, refer to its corresponding API doc
|
||||
|
||||
[[autodoc]] pipelines.ImagePipelineOutput
|
||||
|
||||
## FlaxImagePipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.pipeline_flax_utils.FlaxImagePipelineOutput
|
||||
|
||||
## AudioPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.AudioPipelineOutput
|
||||
|
||||
@@ -72,11 +72,3 @@ Make sure to check out the Schedulers [guide](../../using-diffusers/schedulers)
|
||||
|
||||
## StableDiffusionPipelineOutput
|
||||
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
|
||||
|
||||
## FlaxStableDiffusionControlNetPipeline
|
||||
[[autodoc]] FlaxStableDiffusionControlNetPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## FlaxStableDiffusionControlNetPipelineOutput
|
||||
[[autodoc]] pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput
|
||||
|
||||
@@ -106,10 +106,20 @@ The table below lists all the pipelines currently available in 🤗 Diffusers an
|
||||
|
||||
[[autodoc]] pipelines.StableDiffusionMixin.disable_freeu
|
||||
|
||||
## FlaxDiffusionPipeline
|
||||
|
||||
[[autodoc]] pipelines.pipeline_flax_utils.FlaxDiffusionPipeline
|
||||
|
||||
## PushToHubMixin
|
||||
|
||||
[[autodoc]] utils.PushToHubMixin
|
||||
|
||||
## Callbacks
|
||||
|
||||
[[autodoc]] callbacks.PipelineCallback
|
||||
|
||||
[[autodoc]] callbacks.SDCFGCutoffCallback
|
||||
|
||||
[[autodoc]] callbacks.SDXLCFGCutoffCallback
|
||||
|
||||
[[autodoc]] callbacks.SDXLControlnetCFGCutoffCallback
|
||||
|
||||
[[autodoc]] callbacks.IPAdapterScaleCutoffCallback
|
||||
|
||||
[[autodoc]] callbacks.SD3CFGCutoffCallback
|
||||
|
||||
@@ -120,6 +120,10 @@ The `guidance_scale` parameter in the pipeline is there to support future guidan
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## QwenImageControlNetPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## QwenImagePipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.qwenimage.pipeline_output.QwenImagePipelineOutput
|
||||
@@ -1,4 +1,4 @@
|
||||
<!-- Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@@ -22,7 +22,7 @@
|
||||
|
||||
# SkyReels-V2: Infinite-length Film Generative model
|
||||
|
||||
[SkyReels-V2](https://huggingface.co/papers/2504.13074) by the SkyReels Team.
|
||||
[SkyReels-V2](https://huggingface.co/papers/2504.13074) by the SkyReels Team from Skywork AI.
|
||||
|
||||
*Recent advances in video generation have been driven by diffusion models and autoregressive frameworks, yet critical challenges persist in harmonizing prompt adherence, visual quality, motion dynamics, and duration: compromises in motion dynamics to enhance temporal visual quality, constrained video duration (5-10 seconds) to prioritize resolution, and inadequate shot-aware generation stemming from general-purpose MLLMs' inability to interpret cinematic grammar, such as shot composition, actor expressions, and camera motions. These intertwined limitations hinder realistic long-form synthesis and professional film-style generation. To address these limitations, we propose SkyReels-V2, an Infinite-length Film Generative Model, that synergizes Multi-modal Large Language Model (MLLM), Multi-stage Pretraining, Reinforcement Learning, and Diffusion Forcing Framework. Firstly, we design a comprehensive structural representation of video that combines the general descriptions by the Multi-modal LLM and the detailed shot language by sub-expert models. Aided with human annotation, we then train a unified Video Captioner, named SkyCaptioner-V1, to efficiently label the video data. Secondly, we establish progressive-resolution pretraining for the fundamental video generation, followed by a four-stage post-training enhancement: Initial concept-balanced Supervised Fine-Tuning (SFT) improves baseline quality; Motion-specific Reinforcement Learning (RL) training with human-annotated and synthetic distortion data addresses dynamic artifacts; Our diffusion forcing framework with non-decreasing noise schedules enables long-video synthesis in an efficient search space; Final high-quality SFT refines visual fidelity. All the code and models are available at [this https URL](https://github.com/SkyworkAI/SkyReels-V2).*
|
||||
|
||||
@@ -44,93 +44,113 @@ The following SkyReels-V2 models are supported in Diffusers:
|
||||
|
||||
### A _Visual_ Demonstration
|
||||
|
||||
An example with these parameters:
|
||||
base_num_frames=97, num_frames=97, num_inference_steps=30, ar_step=5, causal_block_size=5
|
||||
The example below has the following parameters:
|
||||
|
||||
vae_scale_factor_temporal -> 4
|
||||
num_latent_frames: (97-1)//vae_scale_factor_temporal+1 = 25 frames -> 5 blocks of 5 frames each
|
||||
- `base_num_frames=97`
|
||||
- `num_frames=97`
|
||||
- `num_inference_steps=30`
|
||||
- `ar_step=5`
|
||||
- `causal_block_size=5`
|
||||
|
||||
base_num_latent_frames = (97-1)//vae_scale_factor_temporal+1 = 25 → blocks = 25//5 = 5 blocks
|
||||
These 5 blocks mean that the maximum context length of the model is 25 frames in the latent space.
|
||||
With `vae_scale_factor_temporal=4`, expect `5` blocks of `5` frames each as calculated by:
|
||||
|
||||
Asynchronous Processing Timeline:
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Steps: 1 6 11 16 21 26 31 36 41 46 50 │
|
||||
│ Block 1: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 2: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 3: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 4: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 5: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
`num_latent_frames: (97-1)//vae_scale_factor_temporal+1 = 25 frames -> 5 blocks of 5 frames each`
|
||||
|
||||
For Long Videos (num_frames > base_num_frames):
|
||||
base_num_frames acts as the "sliding window size" for processing long videos.
|
||||
And the maximum context length in the latent space is calculated with `base_num_latent_frames`:
|
||||
|
||||
Example: 257-frame video with base_num_frames=97, overlap_history=17
|
||||
┌──── Iteration 1 (frames 1-97) ────┐
|
||||
│ Processing window: 97 frames │ → 5 blocks, async processing
|
||||
│ Generates: frames 1-97 │
|
||||
└───────────────────────────────────┘
|
||||
┌────── Iteration 2 (frames 81-177) ──────┐
|
||||
│ Processing window: 97 frames │
|
||||
│ Overlap: 17 frames (81-97) from prev │ → 5 blocks, async processing
|
||||
│ Generates: frames 98-177 │
|
||||
└─────────────────────────────────────────┘
|
||||
┌────── Iteration 3 (frames 161-257) ──────┐
|
||||
│ Processing window: 97 frames │
|
||||
│ Overlap: 17 frames (161-177) from prev │ → 5 blocks, async processing
|
||||
│ Generates: frames 178-257 │
|
||||
└──────────────────────────────────────────┘
|
||||
`base_num_latent_frames = (97-1)//vae_scale_factor_temporal+1 = 25 -> 25//5 = 5 blocks`
|
||||
|
||||
Each iteration independently runs the asynchronous processing with its own 5 blocks.
|
||||
base_num_frames controls:
|
||||
1. Memory usage (larger window = more VRAM)
|
||||
2. Model context length (must match training constraints)
|
||||
3. Number of blocks per iteration (base_num_latent_frames // causal_block_size)
|
||||
Asynchronous Processing Timeline:
|
||||
```text
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ Steps: 1 6 11 16 21 26 31 36 41 46 50 │
|
||||
│ Block 1: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 2: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 3: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 4: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
│ Block 5: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
Each block takes 30 steps to complete denoising.
|
||||
Block N starts at step: 1 + (N-1) x ar_step
|
||||
Total steps: 30 + (5-1) x 5 = 50 steps
|
||||
For Long Videos (`num_frames` > `base_num_frames`):
|
||||
`base_num_frames` acts as the "sliding window size" for processing long videos.
|
||||
|
||||
Example: `257`-frame video with `base_num_frames=97`, `overlap_history=17`
|
||||
```text
|
||||
┌──── Iteration 1 (frames 1-97) ────┐
|
||||
│ Processing window: 97 frames │ → 5 blocks,
|
||||
│ Generates: frames 1-97 │ async processing
|
||||
└───────────────────────────────────┘
|
||||
┌────── Iteration 2 (frames 81-177) ──────┐
|
||||
│ Processing window: 97 frames │
|
||||
│ Overlap: 17 frames (81-97) from prev │ → 5 blocks,
|
||||
│ Generates: frames 98-177 │ async processing
|
||||
└─────────────────────────────────────────┘
|
||||
┌────── Iteration 3 (frames 161-257) ──────┐
|
||||
│ Processing window: 97 frames │
|
||||
│ Overlap: 17 frames (161-177) from prev │ → 5 blocks,
|
||||
│ Generates: frames 178-257 │ async processing
|
||||
└──────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
Each iteration independently runs the asynchronous processing with its own `5` blocks.
|
||||
`base_num_frames` controls:
|
||||
1. Memory usage (larger window = more VRAM)
|
||||
2. Model context length (must match training constraints)
|
||||
3. Number of blocks per iteration (`base_num_latent_frames // causal_block_size`)
|
||||
|
||||
Each block takes `30` steps to complete denoising.
|
||||
Block N starts at step: `1 + (N-1) x ar_step`
|
||||
Total steps: `30 + (5-1) x 5 = 50` steps
|
||||
|
||||
|
||||
Synchronous mode (ar_step=0) would process all blocks/frames simultaneously:
|
||||
┌──────────────────────────────────────────────┐
|
||||
│ Steps: 1 ... 30 │
|
||||
│ All blocks: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
└──────────────────────────────────────────────┘
|
||||
Total steps: 30 steps
|
||||
Synchronous mode (`ar_step=0`) would process all blocks/frames simultaneously:
|
||||
```text
|
||||
┌──────────────────────────────────────────────┐
|
||||
│ Steps: 1 ... 30 │
|
||||
│ All blocks: [■■■■■■■■■■■■■■■■■■■■■■■■■■■■■■] │
|
||||
└──────────────────────────────────────────────┘
|
||||
```
|
||||
Total steps: `30` steps
|
||||
|
||||
|
||||
An example of how the step matrix is constructed for asynchronous processing:
|
||||
Given the parameters: (num_inference_steps=30, flow_shift=8, num_frames=97, ar_step=5, causal_block_size=5)
|
||||
- num_latent_frames = (97 frames - 1) // (4 temporal downsampling) + 1 = 25
|
||||
- step_template = [999, 995, 991, 986, 980, 975, 969, 963, 956, 948,
|
||||
941, 932, 922, 912, 901, 888, 874, 859, 841, 822,
|
||||
799, 773, 743, 708, 666, 615, 551, 470, 363, 216]
|
||||
An example of how the step matrix is constructed for asynchronous processing:
|
||||
Given the parameters: (`num_inference_steps=30, flow_shift=8, num_frames=97, ar_step=5, causal_block_size=5`)
|
||||
```
|
||||
- num_latent_frames = (97 frames - 1) // (4 temporal downsampling) + 1 = 25
|
||||
- step_template = [999, 995, 991, 986, 980, 975, 969, 963, 956, 948,
|
||||
941, 932, 922, 912, 901, 888, 874, 859, 841, 822,
|
||||
799, 773, 743, 708, 666, 615, 551, 470, 363, 216]
|
||||
```
|
||||
|
||||
The algorithm creates a 50x25 step_matrix where:
|
||||
- Row 1: [999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999]
|
||||
- Row 2: [995, 995, 995, 995, 995, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999]
|
||||
- Row 3: [991, 991, 991, 991, 991, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999]
|
||||
- ...
|
||||
- Row 7: [969, 969, 969, 969, 969, 995, 995, 995, 995, 995, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999, 999]
|
||||
- ...
|
||||
- Row 21: [799, 799, 799, 799, 799, 888, 888, 888, 888, 888, 941, 941, 941, 941, 941, 975, 975, 975, 975, 975, 999, 999, 999, 999, 999]
|
||||
- ...
|
||||
- Row 35: [ 0, 0, 0, 0, 0, 216, 216, 216, 216, 216, 666, 666, 666, 666, 666, 822, 822, 822, 822, 822, 901, 901, 901, 901, 901]
|
||||
- ...
|
||||
- Row 42: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 551, 551, 551, 551, 551, 773, 773, 773, 773, 773]
|
||||
- ...
|
||||
- Row 50: [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 216, 216, 216, 216, 216]
|
||||
The algorithm creates a `50x25` `step_matrix` where:
|
||||
```
|
||||
- Row 1: [999×5, 999×5, 999×5, 999×5, 999×5]
|
||||
- Row 2: [995×5, 999×5, 999×5, 999×5, 999×5]
|
||||
- Row 3: [991×5, 999×5, 999×5, 999×5, 999×5]
|
||||
- ...
|
||||
- Row 7: [969×5, 995×5, 999×5, 999×5, 999×5]
|
||||
- ...
|
||||
- Row 21: [799×5, 888×5, 941×5, 975×5, 999×5]
|
||||
- ...
|
||||
- Row 35: [ 0×5, 216×5, 666×5, 822×5, 901×5]
|
||||
- ...
|
||||
- Row 42: [ 0×5, 0×5, 0×5, 551×5, 773×5]
|
||||
- ...
|
||||
- Row 50: [ 0×5, 0×5, 0×5, 0×5, 216×5]
|
||||
```
|
||||
|
||||
Detailed Row 6 Analysis:
|
||||
- step_matrix[5]: [ 975, 975, 975, 975, 975, 999, 999, 999, 999, 999, 999, ..., 999]
|
||||
- step_index[5]: [ 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 0, ..., 0]
|
||||
- step_update_mask[5]: [True,True,True,True,True,True,True,True,True,True,False, ...,False]
|
||||
- valid_interval[5]: (0, 25)
|
||||
Detailed Row `6` Analysis:
|
||||
```
|
||||
- step_matrix[5]: [ 975×5, 999×5, 999×5, 999×5, 999×5]
|
||||
- step_index[5]: [ 6×5, 1×5, 0×5, 0×5, 0×5]
|
||||
- step_update_mask[5]: [True×5, True×5, False×5, False×5, False×5]
|
||||
- valid_interval[5]: (0, 25)
|
||||
```
|
||||
|
||||
Key Pattern: Block `i` lags behind Block `i-1` by exactly `ar_step=5` timesteps, creating the
|
||||
staggered "diffusion forcing" effect where later blocks condition on cleaner earlier blocks.
|
||||
|
||||
Key Pattern: Block i lags behind Block i-1 by exactly ar_step=5 timesteps, creating the
|
||||
staggered "diffusion forcing" effect where later blocks condition on cleaner earlier blocks.
|
||||
|
||||
### Text-to-Video Generation
|
||||
|
||||
@@ -145,23 +165,22 @@ From the original repo:
|
||||
>You can use --ar_step 5 to enable asynchronous inference. When asynchronous inference, --causal_block_size 5 is recommended while it is not supposed to be set for synchronous generation... Asynchronous inference will take more steps to diffuse the whole sequence which means it will be SLOWER than synchronous mode. In our experiments, asynchronous inference may improve the instruction following and visual consistent performance.
|
||||
|
||||
```py
|
||||
# pip install ftfy
|
||||
import torch
|
||||
from diffusers import AutoModel, SkyReelsV2DiffusionForcingPipeline, UniPCMultistepScheduler
|
||||
from diffusers.utils import export_to_video
|
||||
|
||||
vae = AutoModel.from_pretrained("Skywork/SkyReels-V2-DF-14B-540P-Diffusers", subfolder="vae", torch_dtype=torch.float32)
|
||||
transformer = AutoModel.from_pretrained("Skywork/SkyReels-V2-DF-14B-540P-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
|
||||
|
||||
model_id = "Skywork/SkyReels-V2-DF-1.3B-540P-Diffusers"
|
||||
vae = AutoModel.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
||||
|
||||
pipeline = SkyReelsV2DiffusionForcingPipeline.from_pretrained(
|
||||
"Skywork/SkyReels-V2-DF-14B-540P-Diffusers",
|
||||
model_id,
|
||||
vae=vae,
|
||||
transformer=transformer,
|
||||
torch_dtype=torch.bfloat16
|
||||
torch_dtype=torch.bfloat16,
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
flow_shift = 8.0 # 8.0 for T2V, 5.0 for I2V
|
||||
pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config, flow_shift=flow_shift)
|
||||
pipeline = pipeline.to("cuda")
|
||||
|
||||
prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."
|
||||
|
||||
@@ -177,7 +196,7 @@ output = pipeline(
|
||||
overlap_history=None, # Number of frames to overlap for smooth transitions in long videos; 17 for long video generations
|
||||
addnoise_condition=20, # Improves consistency in long video generation
|
||||
).frames[0]
|
||||
export_to_video(output, "T2V.mp4", fps=24, quality=8)
|
||||
export_to_video(output, "video.mp4", fps=24, quality=8)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -198,14 +217,14 @@ from diffusers import AutoencoderKLWan, SkyReelsV2DiffusionForcingImageToVideoPi
|
||||
from diffusers.utils import export_to_video, load_image
|
||||
|
||||
|
||||
model_id = "Skywork/SkyReels-V2-DF-14B-720P-Diffusers"
|
||||
model_id = "Skywork/SkyReels-V2-DF-1.3B-720P-Diffusers"
|
||||
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
||||
pipeline = SkyReelsV2DiffusionForcingImageToVideoPipeline.from_pretrained(
|
||||
model_id, vae=vae, torch_dtype=torch.bfloat16
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
flow_shift = 5.0 # 8.0 for T2V, 5.0 for I2V
|
||||
pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config, flow_shift=flow_shift)
|
||||
pipeline.to("cuda")
|
||||
|
||||
first_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png")
|
||||
last_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png")
|
||||
@@ -239,7 +258,7 @@ prompt = "CG animation style, a small blue bird takes off from the ground, flapp
|
||||
output = pipeline(
|
||||
image=first_frame, last_image=last_frame, prompt=prompt, height=height, width=width, guidance_scale=5.0
|
||||
).frames[0]
|
||||
export_to_video(output, "output.mp4", fps=24, quality=8)
|
||||
export_to_video(output, "video.mp4", fps=24, quality=8)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@@ -261,75 +280,35 @@ from diffusers import AutoencoderKLWan, SkyReelsV2DiffusionForcingVideoToVideoPi
|
||||
from diffusers.utils import export_to_video, load_video
|
||||
|
||||
|
||||
model_id = "Skywork/SkyReels-V2-DF-14B-540P-Diffusers"
|
||||
model_id = "Skywork/SkyReels-V2-DF-1.3B-720P-Diffusers"
|
||||
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
||||
pipeline = SkyReelsV2DiffusionForcingVideoToVideoPipeline.from_pretrained(
|
||||
model_id, vae=vae, torch_dtype=torch.bfloat16
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
flow_shift = 5.0 # 8.0 for T2V, 5.0 for I2V
|
||||
pipeline.scheduler = UniPCMultistepScheduler.from_config(pipeline.scheduler.config, flow_shift=flow_shift)
|
||||
pipeline.to("cuda")
|
||||
|
||||
video = load_video("input_video.mp4")
|
||||
|
||||
prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."
|
||||
|
||||
output = pipeline(
|
||||
video=video, prompt=prompt, height=544, width=960, guidance_scale=5.0,
|
||||
num_inference_steps=30, num_frames=257, base_num_frames=97#, ar_step=5, causal_block_size=5,
|
||||
video=video, prompt=prompt, height=720, width=1280, guidance_scale=5.0, overlap_history=17,
|
||||
num_inference_steps=30, num_frames=257, base_num_frames=121#, ar_step=5, causal_block_size=5,
|
||||
).frames[0]
|
||||
export_to_video(output, "output.mp4", fps=24, quality=8)
|
||||
# Total frames will be the number of frames of given video + 257
|
||||
export_to_video(output, "video.mp4", fps=24, quality=8)
|
||||
# Total frames will be the number of frames of the given video + 257
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
|
||||
## Notes
|
||||
|
||||
- SkyReels-V2 supports LoRAs with [`~loaders.SkyReelsV2LoraLoaderMixin.load_lora_weights`].
|
||||
|
||||
<details>
|
||||
<summary>Show example code</summary>
|
||||
|
||||
```py
|
||||
# pip install ftfy
|
||||
import torch
|
||||
from diffusers import AutoModel, SkyReelsV2DiffusionForcingPipeline
|
||||
from diffusers.utils import export_to_video
|
||||
|
||||
vae = AutoModel.from_pretrained(
|
||||
"Skywork/SkyReels-V2-DF-1.3B-540P-Diffusers", subfolder="vae", torch_dtype=torch.float32
|
||||
)
|
||||
pipeline = SkyReelsV2DiffusionForcingPipeline.from_pretrained(
|
||||
"Skywork/SkyReels-V2-DF-1.3B-540P-Diffusers", vae=vae, torch_dtype=torch.bfloat16
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
|
||||
pipeline.load_lora_weights("benjamin-paine/steamboat-willie-1.3b", adapter_name="steamboat-willie")
|
||||
pipeline.set_adapters("steamboat-willie")
|
||||
|
||||
pipeline.enable_model_cpu_offload()
|
||||
|
||||
# use "steamboat willie style" to trigger the LoRA
|
||||
prompt = """
|
||||
steamboat willie style, golden era animation, The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
|
||||
"""
|
||||
|
||||
output = pipeline(
|
||||
prompt=prompt,
|
||||
num_frames=97,
|
||||
guidance_scale=6.0,
|
||||
).frames[0]
|
||||
export_to_video(output, "output.mp4", fps=24)
|
||||
```
|
||||
|
||||
</details>
|
||||
`SkyReelsV2Pipeline` and `SkyReelsV2ImageToVideoPipeline` are also available without the Diffusion Forcing framework applied.
|
||||
|
||||
|
||||
## SkyReelsV2DiffusionForcingPipeline
|
||||
@@ -364,4 +343,4 @@ export_to_video(output, "output.mp4", fps=24, quality=8)
|
||||
|
||||
## SkyReelsV2PipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.skyreels_v2.pipeline_output.SkyReelsV2PipelineOutput
|
||||
[[autodoc]] pipelines.skyreels_v2.pipeline_output.SkyReelsV2PipelineOutput
|
||||
|
||||
@@ -47,13 +47,3 @@ Make sure to check out the Stable Diffusion [Tips](overview#tips) section to lea
|
||||
## StableDiffusionPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
|
||||
|
||||
## FlaxStableDiffusionImg2ImgPipeline
|
||||
|
||||
[[autodoc]] FlaxStableDiffusionImg2ImgPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## FlaxStableDiffusionPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput
|
||||
|
||||
@@ -49,13 +49,3 @@ If you're interested in using one of the official checkpoints for a task, explor
|
||||
## StableDiffusionPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
|
||||
|
||||
## FlaxStableDiffusionInpaintPipeline
|
||||
|
||||
[[autodoc]] FlaxStableDiffusionInpaintPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## FlaxStableDiffusionPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput
|
||||
|
||||
@@ -51,13 +51,3 @@ If you're interested in using one of the official checkpoints for a task, explor
|
||||
## StableDiffusionPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.stable_diffusion.StableDiffusionPipelineOutput
|
||||
|
||||
## FlaxStableDiffusionPipeline
|
||||
|
||||
[[autodoc]] FlaxStableDiffusionPipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## FlaxStableDiffusionPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.stable_diffusion.FlaxStableDiffusionPipelineOutput
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# Wan2.1
|
||||
# Wan
|
||||
|
||||
[Wan-2.1](https://huggingface.co/papers/2503.20314) by the Wan Team.
|
||||
|
||||
@@ -42,7 +42,7 @@ The following Wan models are supported in Diffusers:
|
||||
- [Wan 2.2 TI2V 5B](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B-Diffusers)
|
||||
|
||||
> [!TIP]
|
||||
> Click on the Wan2.1 models in the right sidebar for more examples of video generation.
|
||||
> Click on the Wan models in the right sidebar for more examples of video generation.
|
||||
|
||||
### Text-to-Video Generation
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Installation
|
||||
|
||||
Diffusers is tested on Python 3.8+, PyTorch 1.4+, and Flax 0.4.1+. Follow the installation instructions for the deep learning library you're using, [PyTorch](https://pytorch.org/get-started/locally/) or [Flax](https://flax.readthedocs.io/en/latest/).
|
||||
Diffusers is tested on Python 3.8+ and PyTorch 1.4+. Install [PyTorch](https://pytorch.org/get-started/locally/) according to your system and setup.
|
||||
|
||||
Create a [virtual environment](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) for easier management of separate projects and to avoid compatibility issues between dependencies. Use [uv](https://docs.astral.sh/uv/), a Rust-based Python package and project manager, to create a virtual environment and install Diffusers.
|
||||
|
||||
@@ -32,12 +32,6 @@ PyTorch only supports Python 3.8 - 3.11 on Windows.
|
||||
uv pip install diffusers["torch"] transformers
|
||||
```
|
||||
|
||||
Use the command below for Flax.
|
||||
|
||||
```bash
|
||||
uv pip install diffusers["flax"] transformers
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="conda">
|
||||
|
||||
@@ -71,27 +65,12 @@ An editable install is recommended for development workflows or if you're using
|
||||
|
||||
Clone the repository and install Diffusers with the following commands.
|
||||
|
||||
<hfoptions id="editable">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/diffusers.git
|
||||
cd diffusers
|
||||
uv pip install -e ".[torch]"
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Flax">
|
||||
|
||||
```bash
|
||||
git clone https://github.com/huggingface/diffusers.git
|
||||
cd diffusers
|
||||
uv pip install -e ".[flax]"
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
> [!WARNING]
|
||||
> You must keep the `diffusers` folder if you want to keep using the library with the editable install.
|
||||
|
||||
@@ -140,7 +119,7 @@ For more details about managing and cleaning the cache, take a look at the [Unde
|
||||
## Telemetry logging
|
||||
|
||||
Diffusers gathers telemetry information during [`~DiffusionPipeline.from_pretrained`] requests.
|
||||
The data gathered includes the Diffusers and PyTorch/Flax version, the requested model or pipeline class,
|
||||
The data gathered includes the Diffusers and PyTorch version, the requested model or pipeline class,
|
||||
and the path to a pretrained checkpoint if it is hosted on the Hub.
|
||||
|
||||
This usage data helps us debug issues and prioritize new features.
|
||||
|
||||
@@ -209,7 +209,7 @@ There is also a [compile_regions](https://github.com/huggingface/accelerate/blob
|
||||
# pip install -U accelerate
|
||||
import torch
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
from accelerate.utils import compile regions
|
||||
from accelerate.utils import compile_regions
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
|
||||
|
||||
@@ -10,7 +10,7 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Compile and offloading quantized models
|
||||
# Compiling and offloading quantized models
|
||||
|
||||
Optimizing models often involves trade-offs between [inference speed](./fp16) and [memory-usage](./memory). For instance, while [caching](./cache) can boost inference speed, it also increases memory consumption since it needs to store the outputs of intermediate attention layers. A more balanced optimization strategy combines quantizing a model, [torch.compile](./fp16#torchcompile) and various [offloading methods](./memory#offloading).
|
||||
|
||||
@@ -28,7 +28,8 @@ The table below provides a comparison of optimization strategy combinations and
|
||||
| quantization | 32.602 | 14.9453 |
|
||||
| quantization, torch.compile | 25.847 | 14.9448 |
|
||||
| quantization, torch.compile, model CPU offloading | 32.312 | 12.2369 |
|
||||
<small>These results are benchmarked on Flux with a RTX 4090. The transformer and text_encoder components are quantized. Refer to the [benchmarking script](https://gist.github.com/sayakpaul/0db9d8eeeb3d2a0e5ed7cf0d9ca19b7d) if you're interested in evaluating your own model.</small>
|
||||
|
||||
<small>These results are benchmarked on Flux with a RTX 4090. The transformer and text_encoder components are quantized. Refer to the <a href="https://gist.github.com/sayakpaul/0db9d8eeeb3d2a0e5ed7cf0d9ca19b7d">benchmarking script</a> if you're interested in evaluating your own model.</small>
|
||||
|
||||
This guide will show you how to compile and offload a quantized model with [bitsandbytes](../quantization/bitsandbytes#torchcompile). Make sure you are using [PyTorch nightly](https://pytorch.org/get-started/locally/) and the latest version of bitsandbytes.
|
||||
|
||||
|
||||
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
[ControlNet](https://hf.co/papers/2302.05543) models are adapters trained on top of another pretrained model. It allows for a greater degree of control over image generation by conditioning the model with an additional input image. The input image can be a canny edge, depth map, human pose, and many more.
|
||||
|
||||
If you're training on a GPU with limited vRAM, you should try enabling the `gradient_checkpointing`, `gradient_accumulation_steps`, and `mixed_precision` parameters in the training command. You can also reduce your memory footprint by using memory-efficient attention with [xFormers](../optimization/xformers). JAX/Flax training is also supported for efficient training on TPUs and GPUs, but it doesn't support gradient checkpointing or xFormers. You should have a GPU with >30GB of memory if you want to train faster with Flax.
|
||||
If you're training on a GPU with limited vRAM, you should try enabling the `gradient_checkpointing`, `gradient_accumulation_steps`, and `mixed_precision` parameters in the training command. You can also reduce your memory footprint by using memory-efficient attention with [xFormers](../optimization/xformers).
|
||||
|
||||
This guide will explore the [train_controlnet.py](https://github.com/huggingface/diffusers/blob/main/examples/controlnet/train_controlnet.py) training script to help you become familiar with it, and how you can adapt it for your own use-case.
|
||||
|
||||
@@ -28,45 +28,10 @@ pip install .
|
||||
|
||||
Then navigate to the example folder containing the training script and install the required dependencies for the script you're using:
|
||||
|
||||
<hfoptions id="installation">
|
||||
<hfoption id="PyTorch">
|
||||
```bash
|
||||
cd examples/controlnet
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="Flax">
|
||||
|
||||
If you have access to a TPU, the Flax training script runs even faster! Let's run the training script on the [Google Cloud TPU VM](https://cloud.google.com/tpu/docs/run-calculation-jax). Create a single TPU v4-8 VM and connect to it:
|
||||
|
||||
```bash
|
||||
ZONE=us-central2-b
|
||||
TPU_TYPE=v4-8
|
||||
VM_NAME=hg_flax
|
||||
|
||||
gcloud alpha compute tpus tpu-vm create $VM_NAME \
|
||||
--zone $ZONE \
|
||||
--accelerator-type $TPU_TYPE \
|
||||
--version tpu-vm-v4-base
|
||||
|
||||
gcloud alpha compute tpus tpu-vm ssh $VM_NAME --zone $ZONE -- \
|
||||
```
|
||||
|
||||
Install JAX 0.4.5:
|
||||
|
||||
```bash
|
||||
pip install "jax[tpu]==0.4.5" -f https://storage.googleapis.com/jax-releases/libtpu_releases.html
|
||||
```
|
||||
|
||||
Then install the required dependencies for the Flax script:
|
||||
|
||||
```bash
|
||||
cd examples/controlnet
|
||||
pip install -r requirements_flax.txt
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
<Tip>
|
||||
|
||||
@@ -120,7 +85,7 @@ Many of the basic and important parameters are described in the [Text-to-image](
|
||||
|
||||
### Min-SNR weighting
|
||||
|
||||
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch and is unavailable in the Flax training script.
|
||||
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch.
|
||||
|
||||
Add the `--snr_gamma` parameter and set it to the recommended value of 5.0:
|
||||
|
||||
@@ -272,9 +237,6 @@ That's it! You don't need to add any additional parameters to your training comm
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
<hfoptions id="training-inference">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```bash
|
||||
export MODEL_DIR="stable-diffusion-v1-5/stable-diffusion-v1-5"
|
||||
export OUTPUT_DIR="path/to/save/model"
|
||||
@@ -292,47 +254,6 @@ accelerate launch train_controlnet.py \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Flax">
|
||||
|
||||
With Flax, you can [profile your code](https://jax.readthedocs.io/en/latest/profiling.html) by adding the `--profile_steps=5` parameter to your training command. Install the Tensorboard profile plugin:
|
||||
|
||||
```bash
|
||||
pip install tensorflow tensorboard-plugin-profile
|
||||
tensorboard --logdir runs/fill-circle-100steps-20230411_165612/
|
||||
```
|
||||
|
||||
Then you can inspect the profile at [http://localhost:6006/#profile](http://localhost:6006/#profile).
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
If you run into version conflicts with the plugin, try uninstalling and reinstalling all versions of TensorFlow and Tensorboard. The debugging functionality of the profile plugin is still experimental, and not all views are fully functional. The `trace_viewer` cuts off events after 1M, which can result in all your device traces getting lost if, for example, you profile the compilation step by accident.
|
||||
|
||||
</Tip>
|
||||
|
||||
```bash
|
||||
python3 train_controlnet_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--dataset_name=fusing/fill50k \
|
||||
--resolution=512 \
|
||||
--learning_rate=1e-5 \
|
||||
--validation_image "./conditioning_image_1.png" "./conditioning_image_2.png" \
|
||||
--validation_prompt "red circle with blue background" "cyan circle with brown floral background" \
|
||||
--validation_steps=1000 \
|
||||
--train_batch_size=2 \
|
||||
--revision="non-ema" \
|
||||
--from_pt \
|
||||
--report_to="wandb" \
|
||||
--tracker_project_name=$HUB_MODEL_ID \
|
||||
--num_train_epochs=11 \
|
||||
--push_to_hub \
|
||||
--hub_model_id=$HUB_MODEL_ID
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Once training is complete, you can use your newly trained model for inference!
|
||||
|
||||
```py
|
||||
|
||||
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
[DreamBooth](https://huggingface.co/papers/2208.12242) is a training technique that updates the entire diffusion model by training on just a few images of a subject or style. It works by associating a special word in the prompt with the example images.
|
||||
|
||||
If you're training on a GPU with limited vRAM, you should try enabling the `gradient_checkpointing` and `mixed_precision` parameters in the training command. You can also reduce your memory footprint by using memory-efficient attention with [xFormers](../optimization/xformers). JAX/Flax training is also supported for efficient training on TPUs and GPUs, but it doesn't support gradient checkpointing or xFormers. You should have a GPU with >30GB of memory if you want to train faster with Flax.
|
||||
If you're training on a GPU with limited vRAM, you should try enabling the `gradient_checkpointing` and `mixed_precision` parameters in the training command. You can also reduce your memory footprint by using memory-efficient attention with [xFormers](../optimization/xformers).
|
||||
|
||||
This guide will explore the [train_dreambooth.py](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py) script to help you become more familiar with it, and how you can adapt it for your own use-case.
|
||||
|
||||
@@ -28,25 +28,11 @@ pip install .
|
||||
|
||||
Navigate to the example folder with the training script and install the required dependencies for the script you're using:
|
||||
|
||||
<hfoptions id="installation">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```bash
|
||||
cd examples/dreambooth
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Flax">
|
||||
|
||||
```bash
|
||||
cd examples/dreambooth
|
||||
pip install -r requirements_flax.txt
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
<Tip>
|
||||
|
||||
🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more.
|
||||
@@ -110,7 +96,7 @@ Some basic and important parameters to know and specify are:
|
||||
|
||||
### Min-SNR weighting
|
||||
|
||||
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch and is unavailable in the Flax training script.
|
||||
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch.
|
||||
|
||||
Add the `--snr_gamma` parameter and set it to the recommended value of 5.0:
|
||||
|
||||
@@ -311,9 +297,6 @@ That's it! You don't need to add any additional parameters to your training comm
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
<hfoptions id="training-inference">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
|
||||
export INSTANCE_DIR="./dog"
|
||||
@@ -334,29 +317,6 @@ accelerate launch train_dreambooth.py \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Flax">
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export INSTANCE_DIR="./dog"
|
||||
export OUTPUT_DIR="path-to-save-model"
|
||||
|
||||
python train_dreambooth_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
--instance_prompt="a photo of sks dog" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--learning_rate=5e-6 \
|
||||
--max_train_steps=400 \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Once training is complete, you can use your newly trained model for inference!
|
||||
|
||||
<Tip>
|
||||
@@ -383,9 +343,6 @@ image.save("dog-bucket.png")
|
||||
|
||||
</Tip>
|
||||
|
||||
<hfoptions id="training-inference">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
@@ -395,39 +352,6 @@ image = pipeline("A photo of sks dog in a bucket", num_inference_steps=50, guida
|
||||
image.save("dog-bucket.png")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Flax">
|
||||
|
||||
```py
|
||||
import jax
|
||||
import numpy as np
|
||||
from flax.jax_utils import replicate
|
||||
from flax.training.common_utils import shard
|
||||
from diffusers import FlaxStableDiffusionPipeline
|
||||
|
||||
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained("path-to-your-trained-model", dtype=jax.numpy.bfloat16)
|
||||
|
||||
prompt = "A photo of sks dog in a bucket"
|
||||
prng_seed = jax.random.PRNGKey(0)
|
||||
num_inference_steps = 50
|
||||
|
||||
num_samples = jax.device_count()
|
||||
prompt = num_samples * [prompt]
|
||||
prompt_ids = pipeline.prepare_inputs(prompt)
|
||||
|
||||
# shard inputs and rng
|
||||
params = replicate(params)
|
||||
prng_seed = jax.random.split(prng_seed, jax.device_count())
|
||||
prompt_ids = shard(prompt_ids)
|
||||
|
||||
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
|
||||
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
|
||||
image.save("dog-bucket.png")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## LoRA
|
||||
|
||||
LoRA is a training technique for significantly reducing the number of trainable parameters. As a result, training is faster and it is easier to store the resulting weights because they are a lot smaller (~100MBs). Use the [train_dreambooth_lora.py](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth_lora.py) script to train with LoRA.
|
||||
|
||||
@@ -88,7 +88,7 @@ Most of the parameters are identical to the parameters in the [Text-to-image](te
|
||||
|
||||
### Min-SNR weighting
|
||||
|
||||
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch and is unavailable in the Flax training script.
|
||||
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch.
|
||||
|
||||
Add the `--snr_gamma` parameter and set it to the recommended value of 5.0:
|
||||
|
||||
|
||||
@@ -38,25 +38,11 @@ pip install .
|
||||
|
||||
Navigate to the example folder with the training script and install the required dependencies for the script you're using:
|
||||
|
||||
<hfoptions id="installation">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```bash
|
||||
cd examples/text_to_image
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Flax">
|
||||
|
||||
```bash
|
||||
cd examples/text_to_image
|
||||
pip install -r requirements_flax.txt
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
<Tip>
|
||||
|
||||
🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more.
|
||||
|
||||
@@ -23,18 +23,18 @@ Each training script is:
|
||||
|
||||
Our current collection of training scripts includes:
|
||||
|
||||
| Training | SDXL-support | LoRA-support | Flax-support |
|
||||
|---|---|---|---|
|
||||
| [unconditional image generation](https://github.com/huggingface/diffusers/tree/main/examples/unconditional_image_generation) [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) | | | |
|
||||
| [text-to-image](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image) | 👍 | 👍 | 👍 |
|
||||
| [textual inversion](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb) | | | 👍 |
|
||||
| [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb) | 👍 | 👍 | 👍 |
|
||||
| [ControlNet](https://github.com/huggingface/diffusers/tree/main/examples/controlnet) | 👍 | | 👍 |
|
||||
| [InstructPix2Pix](https://github.com/huggingface/diffusers/tree/main/examples/instruct_pix2pix) | 👍 | | |
|
||||
| [Custom Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/custom_diffusion) | | | |
|
||||
| [T2I-Adapters](https://github.com/huggingface/diffusers/tree/main/examples/t2i_adapter) | 👍 | | |
|
||||
| [Kandinsky 2.2](https://github.com/huggingface/diffusers/tree/main/examples/kandinsky2_2/text_to_image) | | 👍 | |
|
||||
| [Wuerstchen](https://github.com/huggingface/diffusers/tree/main/examples/wuerstchen/text_to_image) | | 👍 | |
|
||||
| Training | SDXL-support | LoRA-support |
|
||||
|---|---|---|
|
||||
| [unconditional image generation](https://github.com/huggingface/diffusers/tree/main/examples/unconditional_image_generation) [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/training_example.ipynb) | | |
|
||||
| [text-to-image](https://github.com/huggingface/diffusers/tree/main/examples/text_to_image) | 👍 | 👍 |
|
||||
| [textual inversion](https://github.com/huggingface/diffusers/tree/main/examples/textual_inversion) [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_textual_inversion_training.ipynb) | | |
|
||||
| [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth) [](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/sd_dreambooth_training.ipynb) | 👍 | 👍 |
|
||||
| [ControlNet](https://github.com/huggingface/diffusers/tree/main/examples/controlnet) | 👍 | |
|
||||
| [InstructPix2Pix](https://github.com/huggingface/diffusers/tree/main/examples/instruct_pix2pix) | 👍 | |
|
||||
| [Custom Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/custom_diffusion) | | |
|
||||
| [T2I-Adapters](https://github.com/huggingface/diffusers/tree/main/examples/t2i_adapter) | 👍 | |
|
||||
| [Kandinsky 2.2](https://github.com/huggingface/diffusers/tree/main/examples/kandinsky2_2/text_to_image) | | 👍 |
|
||||
| [Wuerstchen](https://github.com/huggingface/diffusers/tree/main/examples/wuerstchen/text_to_image) | | 👍 |
|
||||
|
||||
These examples are **actively** maintained, so please feel free to open an issue if they aren't working as expected. If you feel like another training example should be included, you're more than welcome to start a [Feature Request](https://github.com/huggingface/diffusers/issues/new?assignees=&labels=&template=feature_request.md&title=) to discuss your feature idea with us and whether it meets our criteria of being self-contained, easy-to-tweak, beginner-friendly, and single-purpose.
|
||||
|
||||
@@ -48,7 +48,7 @@ cd diffusers
|
||||
pip install .
|
||||
```
|
||||
|
||||
Then navigate to the folder of the training script (for example, [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth)) and install the `requirements.txt` file. Some training scripts have a specific requirement file for SDXL, LoRA or Flax. If you're using one of these scripts, make sure you install its corresponding requirements file.
|
||||
Then navigate to the folder of the training script (for example, [DreamBooth](https://github.com/huggingface/diffusers/tree/main/examples/dreambooth)) and install the `requirements.txt` file. Some training scripts have a specific requirement file for SDXL or LoRA. If you're using one of these scripts, make sure you install its corresponding requirements file.
|
||||
|
||||
```bash
|
||||
cd examples/dreambooth
|
||||
|
||||
@@ -96,7 +96,7 @@ Most of the parameters are identical to the parameters in the [Text-to-image](te
|
||||
|
||||
### Min-SNR weighting
|
||||
|
||||
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting either `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch and is unavailable in the Flax training script.
|
||||
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting either `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch.
|
||||
|
||||
Add the `--snr_gamma` parameter and set it to the recommended value of 5.0:
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ The text-to-image script is experimental, and it's easy to overfit and run into
|
||||
|
||||
Text-to-image models like Stable Diffusion are conditioned to generate images given a text prompt.
|
||||
|
||||
Training a model can be taxing on your hardware, but if you enable `gradient_checkpointing` and `mixed_precision`, it is possible to train a model on a single 24GB GPU. If you're training with larger batch sizes or want to train faster, it's better to use GPUs with more than 30GB of memory. You can reduce your memory footprint by enabling memory-efficient attention with [xFormers](../optimization/xformers). JAX/Flax training is also supported for efficient training on TPUs and GPUs, but it doesn't support gradient checkpointing, gradient accumulation or xFormers. A GPU with at least 30GB of memory or a TPU v3 is recommended for training with Flax.
|
||||
Training a model can be taxing on your hardware, but if you enable `gradient_checkpointing` and `mixed_precision`, it is possible to train a model on a single 24GB GPU. If you're training with larger batch sizes or want to train faster, it's better to use GPUs with more than 30GB of memory. You can reduce your memory footprint by enabling memory-efficient attention with [xFormers](../optimization/xformers).
|
||||
|
||||
This guide will explore the [train_text_to_image.py](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) training script to help you become familiar with it, and how you can adapt it for your own use-case.
|
||||
|
||||
@@ -34,20 +34,10 @@ pip install .
|
||||
|
||||
Then navigate to the example folder containing the training script and install the required dependencies for the script you're using:
|
||||
|
||||
<hfoptions id="installation">
|
||||
<hfoption id="PyTorch">
|
||||
```bash
|
||||
cd examples/text_to_image
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="Flax">
|
||||
```bash
|
||||
cd examples/text_to_image
|
||||
pip install -r requirements_flax.txt
|
||||
```
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
<Tip>
|
||||
|
||||
@@ -106,7 +96,7 @@ Some basic and important parameters include:
|
||||
|
||||
### Min-SNR weighting
|
||||
|
||||
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch and is unavailable in the Flax training script.
|
||||
The [Min-SNR](https://huggingface.co/papers/2303.09556) weighting strategy can help with training by rebalancing the loss to achieve faster convergence. The training script supports predicting `epsilon` (noise) or `v_prediction`, but Min-SNR is compatible with both prediction types. This weighting strategy is only supported by PyTorch.
|
||||
|
||||
Add the `--snr_gamma` parameter and set it to the recommended value of 5.0:
|
||||
|
||||
@@ -155,9 +145,6 @@ Lastly, the [training loop](https://github.com/huggingface/diffusers/blob/8959c5
|
||||
|
||||
Once you've made all your changes or you're okay with the default configuration, you're ready to launch the training script! 🚀
|
||||
|
||||
<hfoptions id="training-inference">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
Let's train on the [Naruto BLIP captions](https://huggingface.co/datasets/lambdalabs/naruto-blip-captions) dataset to generate your own Naruto characters. Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path). If you're training on more than one GPU, add the `--multi_gpu` parameter to the `accelerate launch` command.
|
||||
|
||||
<Tip>
|
||||
@@ -187,43 +174,8 @@ accelerate launch --mixed_precision="fp16" train_text_to_image.py \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Flax">
|
||||
|
||||
Training with Flax can be faster on TPUs and GPUs thanks to [@duongna211](https://github.com/duongna21). Flax is more efficient on a TPU, but GPU performance is also great.
|
||||
|
||||
Set the environment variables `MODEL_NAME` and `dataset_name` to the model and the dataset (either from the Hub or a local path).
|
||||
|
||||
<Tip>
|
||||
|
||||
To train on a local dataset, set the `TRAIN_DIR` and `OUTPUT_DIR` environment variables to the path of the dataset and where to save the model to.
|
||||
|
||||
</Tip>
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
|
||||
export dataset_name="lambdalabs/naruto-blip-captions"
|
||||
|
||||
python train_text_to_image_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--dataset_name=$dataset_name \
|
||||
--resolution=512 --center_crop --random_flip \
|
||||
--train_batch_size=1 \
|
||||
--max_train_steps=15000 \
|
||||
--learning_rate=1e-05 \
|
||||
--max_grad_norm=1 \
|
||||
--output_dir="sd-naruto-model" \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Once training is complete, you can use your newly trained model for inference:
|
||||
|
||||
<hfoptions id="training-inference">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPipeline
|
||||
import torch
|
||||
@@ -234,39 +186,6 @@ image = pipeline(prompt="yoda").images[0]
|
||||
image.save("yoda-naruto.png")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Flax">
|
||||
|
||||
```py
|
||||
import jax
|
||||
import numpy as np
|
||||
from flax.jax_utils import replicate
|
||||
from flax.training.common_utils import shard
|
||||
from diffusers import FlaxStableDiffusionPipeline
|
||||
|
||||
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained("path/to/saved_model", dtype=jax.numpy.bfloat16)
|
||||
|
||||
prompt = "yoda naruto"
|
||||
prng_seed = jax.random.PRNGKey(0)
|
||||
num_inference_steps = 50
|
||||
|
||||
num_samples = jax.device_count()
|
||||
prompt = num_samples * [prompt]
|
||||
prompt_ids = pipeline.prepare_inputs(prompt)
|
||||
|
||||
# shard inputs and rng
|
||||
params = replicate(params)
|
||||
prng_seed = jax.random.split(prng_seed, jax.device_count())
|
||||
prompt_ids = shard(prompt_ids)
|
||||
|
||||
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
|
||||
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
|
||||
image.save("yoda-naruto.png")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Next steps
|
||||
|
||||
Congratulations on training your own text-to-image model! To learn more about how to use your new model, the following guides may be helpful:
|
||||
|
||||
@@ -14,7 +14,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
[Textual Inversion](https://hf.co/papers/2208.01618) is a training technique for personalizing image generation models with just a few example images of what you want it to learn. This technique works by learning and updating the text embeddings (the new embeddings are tied to a special word you must use in the prompt) to match the example images you provide.
|
||||
|
||||
If you're training on a GPU with limited vRAM, you should try enabling the `gradient_checkpointing` and `mixed_precision` parameters in the training command. You can also reduce your memory footprint by using memory-efficient attention with [xFormers](../optimization/xformers). JAX/Flax training is also supported for efficient training on TPUs and GPUs, but it doesn't support gradient checkpointing or xFormers. With the same configuration and setup as PyTorch, the Flax training script should be at least ~70% faster!
|
||||
If you're training on a GPU with limited vRAM, you should try enabling the `gradient_checkpointing` and `mixed_precision` parameters in the training command. You can also reduce your memory footprint by using memory-efficient attention with [xFormers](../optimization/xformers).
|
||||
|
||||
This guide will explore the [textual_inversion.py](https://github.com/huggingface/diffusers/blob/main/examples/textual_inversion/textual_inversion.py) script to help you become more familiar with it, and how you can adapt it for your own use-case.
|
||||
|
||||
@@ -28,25 +28,10 @@ pip install .
|
||||
|
||||
Navigate to the example folder with the training script and install the required dependencies for the script you're using:
|
||||
|
||||
<hfoptions id="installation">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```bash
|
||||
cd examples/textual_inversion
|
||||
pip install -r requirements.txt
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Flax">
|
||||
|
||||
```bash
|
||||
cd examples/textual_inversion
|
||||
pip install -r requirements_flax.txt
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
<Tip>
|
||||
|
||||
🤗 Accelerate is a library for helping you train on multiple GPUs/TPUs or with mixed-precision. It'll automatically configure your training setup based on your hardware and environment. Take a look at the 🤗 Accelerate [Quick tour](https://huggingface.co/docs/accelerate/quicktour) to learn more.
|
||||
@@ -189,9 +174,6 @@ One more thing before you launch the script. If you're interested in following a
|
||||
--validation_steps=100
|
||||
```
|
||||
|
||||
<hfoptions id="training-inference">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="stable-diffusion-v1-5/stable-diffusion-v1-5"
|
||||
export DATA_DIR="./cat"
|
||||
@@ -214,36 +196,8 @@ accelerate launch textual_inversion.py \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Flax">
|
||||
|
||||
```bash
|
||||
export MODEL_NAME="duongna/stable-diffusion-v1-4-flax"
|
||||
export DATA_DIR="./cat"
|
||||
|
||||
python textual_inversion_flax.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--train_data_dir=$DATA_DIR \
|
||||
--learnable_property="object" \
|
||||
--placeholder_token="<cat-toy>" \
|
||||
--initializer_token="toy" \
|
||||
--resolution=512 \
|
||||
--train_batch_size=1 \
|
||||
--max_train_steps=3000 \
|
||||
--learning_rate=5.0e-04 \
|
||||
--scale_lr \
|
||||
--output_dir="textual_inversion_cat" \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
After training is complete, you can use your newly trained model for inference like:
|
||||
|
||||
<hfoptions id="training-inference">
|
||||
<hfoption id="PyTorch">
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionPipeline
|
||||
import torch
|
||||
@@ -254,42 +208,6 @@ image = pipeline("A <cat-toy> train", num_inference_steps=50).images[0]
|
||||
image.save("cat-train.png")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Flax">
|
||||
|
||||
Flax doesn't support the [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] method, but the textual_inversion_flax.py script [saves](https://github.com/huggingface/diffusers/blob/c0f058265161178f2a88849e92b37ffdc81f1dcc/examples/textual_inversion/textual_inversion_flax.py#L636C2-L636C2) the learned embeddings as a part of the model after training. This means you can use the model for inference like any other Flax model:
|
||||
|
||||
```py
|
||||
import jax
|
||||
import numpy as np
|
||||
from flax.jax_utils import replicate
|
||||
from flax.training.common_utils import shard
|
||||
from diffusers import FlaxStableDiffusionPipeline
|
||||
|
||||
model_path = "path-to-your-trained-model"
|
||||
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(model_path, dtype=jax.numpy.bfloat16)
|
||||
|
||||
prompt = "A <cat-toy> train"
|
||||
prng_seed = jax.random.PRNGKey(0)
|
||||
num_inference_steps = 50
|
||||
|
||||
num_samples = jax.device_count()
|
||||
prompt = num_samples * [prompt]
|
||||
prompt_ids = pipeline.prepare_inputs(prompt)
|
||||
|
||||
# shard inputs and rng
|
||||
params = replicate(params)
|
||||
prng_seed = jax.random.split(prng_seed, jax.device_count())
|
||||
prompt_ids = shard(prompt_ids)
|
||||
|
||||
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
|
||||
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
|
||||
image.save("cat-train.png")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Next steps
|
||||
|
||||
Congratulations on training your own Textual Inversion model! 🎉 To learn more about how to use your new model, the following guides may be helpful:
|
||||
|
||||
@@ -94,7 +94,7 @@ pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
pipeline.unet.load_lora_adapter(
|
||||
"jbilcke-hf/sdxl-cinematic-1",
|
||||
weight_name="pytorch_lora_weights.safetensors",
|
||||
adapter_name="cinematic"
|
||||
adapter_name="cinematic",
|
||||
prefix="unet"
|
||||
)
|
||||
# use cnmt in the prompt to trigger the LoRA
|
||||
@@ -688,4 +688,4 @@ Browse the [LoRA Studio](https://lorastudio.co/models) for different LoRAs to us
|
||||
|
||||
You can find additional LoRAs in the [FLUX LoRA the Explorer](https://huggingface.co/spaces/multimodalart/flux-lora-the-explorer) and [LoRA the Explorer](https://huggingface.co/spaces/multimodalart/LoraTheExplorer) Spaces.
|
||||
|
||||
Check out the [Fast LoRA inference for Flux with Diffusers and PEFT](https://huggingface.co/blog/lora-fast) blog post to learn how to optimize LoRA inference with methods like FlashAttention-3 and fp8 quantization.
|
||||
Check out the [Fast LoRA inference for Flux with Diffusers and PEFT](https://huggingface.co/blog/lora-fast) blog post to learn how to optimize LoRA inference with methods like FlashAttention-3 and fp8 quantization.
|
||||
|
||||
@@ -12,52 +12,37 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# Pipeline callbacks
|
||||
|
||||
The denoising loop of a pipeline can be modified with custom defined functions using the `callback_on_step_end` parameter. The callback function is executed at the end of each step, and modifies the pipeline attributes and variables for the next step. This is really useful for *dynamically* adjusting certain pipeline attributes or modifying tensor variables. This versatility allows for interesting use cases such as changing the prompt embeddings at each timestep, assigning different weights to the prompt embeddings, and editing the guidance scale. With callbacks, you can implement new features without modifying the underlying code!
|
||||
A callback is a function that modifies [`DiffusionPipeline`] behavior and it is executed at the end of a denoising step. The changes are propagated to subsequent steps in the denoising process. It is useful for adjusting pipeline attributes or tensor variables to support new features without rewriting the underlying pipeline code.
|
||||
|
||||
> [!TIP]
|
||||
> 🤗 Diffusers currently only supports `callback_on_step_end`, but feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) if you have a cool use-case and require a callback function with a different execution point!
|
||||
Diffusers provides several callbacks in the pipeline [overview](../api/pipelines/overview#callbacks).
|
||||
|
||||
This guide will demonstrate how callbacks work by walking through a few features you can implement with them.
|
||||
To enable a callback, configure when the callback is executed after a certain number of denoising steps with one of the following arguments.
|
||||
|
||||
## Official callbacks
|
||||
- `cutoff_step_ratio` specifies when a callback is activated as a percentage of the total denoising steps.
|
||||
- `cutoff_step_index` specifies the exact step number at which a callback is activated.
|
||||
|
||||
We provide a list of callbacks you can plug into an existing pipeline and modify the denoising loop. This is the current list of official callbacks:
|
||||
The example below uses `cutoff_step_ratio=0.4`, which means the callback is activated once denoising reaches 40% of the total inference steps. [`~callbacks.SDXLCFGCutoffCallback`] disables classifier-free guidance (CFG) after a certain number of steps, which can help save compute without significantly affecting performance.
|
||||
|
||||
- `SDCFGCutoffCallback`: Disables the CFG after a certain number of steps for all SD 1.5 pipelines, including text-to-image, image-to-image, inpaint, and controlnet.
|
||||
- `SDXLCFGCutoffCallback`: Disables the CFG after a certain number of steps for all SDXL pipelines, including text-to-image, image-to-image, inpaint, and controlnet.
|
||||
- `IPAdapterScaleCutoffCallback`: Disables the IP Adapter after a certain number of steps for all pipelines supporting IP-Adapter.
|
||||
Define a callback with either of the `cutoff` arguments and pass it to the `callback_on_step_end` parameter in the pipeline.
|
||||
|
||||
> [!TIP]
|
||||
> If you want to add a new official callback, feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) or [submit a PR](https://huggingface.co/docs/diffusers/main/en/conceptual/contribution#how-to-open-a-pr).
|
||||
|
||||
To set up a callback, you need to specify the number of denoising steps after which the callback comes into effect. You can do so by using either one of these two arguments:
|
||||
|
||||
- `cutoff_step_ratio`: Float number with the ratio of the steps.
|
||||
- `cutoff_step_index`: Integer number with the exact number of the step.
|
||||
|
||||
```python
|
||||
```py
|
||||
import torch
|
||||
|
||||
from diffusers import DPMSolverMultistepScheduler, StableDiffusionXLPipeline
|
||||
from diffusers.callbacks import SDXLCFGCutoffCallback
|
||||
|
||||
|
||||
callback = SDXLCFGCutoffCallback(cutoff_step_ratio=0.4)
|
||||
# can also be used with cutoff_step_index
|
||||
# if using cutoff_step_index
|
||||
# callback = SDXLCFGCutoffCallback(cutoff_step_ratio=None, cutoff_step_index=10)
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
).to("cuda")
|
||||
device_map="cuda"
|
||||
)
|
||||
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config, use_karras_sigmas=True)
|
||||
|
||||
prompt = "a sports car at the road, best quality, high quality, high detail, 8k resolution"
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(2628670641)
|
||||
|
||||
out = pipeline(
|
||||
output = pipeline(
|
||||
prompt=prompt,
|
||||
negative_prompt="",
|
||||
guidance_scale=6.5,
|
||||
@@ -65,83 +50,16 @@ out = pipeline(
|
||||
generator=generator,
|
||||
callback_on_step_end=callback,
|
||||
)
|
||||
|
||||
out.images[0].save("official_callback.png")
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/without_cfg_callback.png" alt="generated image of a sports car at the road" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">without SDXLCFGCutoffCallback</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/with_cfg_callback.png" alt="generated image of a sports car at the road with cfg callback" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">with SDXLCFGCutoffCallback</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
If you want to add a new official callback, feel free to open a [feature request](https://github.com/huggingface/diffusers/issues/new/choose) or [submit a PR](https://huggingface.co/docs/diffusers/main/en/conceptual/contribution#how-to-open-a-pr). Otherwise, you can also create your own callback as shown below.
|
||||
|
||||
## Dynamic classifier-free guidance
|
||||
## Early stopping
|
||||
|
||||
Dynamic classifier-free guidance (CFG) is a feature that allows you to disable CFG after a certain number of inference steps which can help you save compute with minimal cost to performance. The callback function for this should have the following arguments:
|
||||
|
||||
- `pipeline` (or the pipeline instance) provides access to important properties such as `num_timesteps` and `guidance_scale`. You can modify these properties by updating the underlying attributes. For this example, you'll disable CFG by setting `pipeline._guidance_scale=0.0`.
|
||||
- `step_index` and `timestep` tell you where you are in the denoising loop. Use `step_index` to turn off CFG after reaching 40% of `num_timesteps`.
|
||||
- `callback_kwargs` is a dict that contains tensor variables you can modify during the denoising loop. It only includes variables specified in the `callback_on_step_end_tensor_inputs` argument, which is passed to the pipeline's `__call__` method. Different pipelines may use different sets of variables, so please check a pipeline's `_callback_tensor_inputs` attribute for the list of variables you can modify. Some common variables include `latents` and `prompt_embeds`. For this function, change the batch size of `prompt_embeds` after setting `guidance_scale=0.0` in order for it to work properly.
|
||||
|
||||
Your callback function should look something like this:
|
||||
|
||||
```python
|
||||
def callback_dynamic_cfg(pipe, step_index, timestep, callback_kwargs):
|
||||
# adjust the batch_size of prompt_embeds according to guidance_scale
|
||||
if step_index == int(pipeline.num_timesteps * 0.4):
|
||||
prompt_embeds = callback_kwargs["prompt_embeds"]
|
||||
prompt_embeds = prompt_embeds.chunk(2)[-1]
|
||||
|
||||
# update guidance_scale and prompt_embeds
|
||||
pipeline._guidance_scale = 0.0
|
||||
callback_kwargs["prompt_embeds"] = prompt_embeds
|
||||
return callback_kwargs
|
||||
```
|
||||
|
||||
Now, you can pass the callback function to the `callback_on_step_end` parameter and the `prompt_embeds` to `callback_on_step_end_tensor_inputs`.
|
||||
Early stopping is useful if you aren't happy with the intermediate results during generation. This callback sets a hardcoded stop point after which the pipeline terminates by setting the `_interrupt` attribute to `True`.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipeline = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float16)
|
||||
pipeline = pipeline.to("cuda")
|
||||
|
||||
prompt = "a photo of an astronaut riding a horse on mars"
|
||||
|
||||
generator = torch.Generator(device="cuda").manual_seed(1)
|
||||
out = pipeline(
|
||||
prompt,
|
||||
generator=generator,
|
||||
callback_on_step_end=callback_dynamic_cfg,
|
||||
callback_on_step_end_tensor_inputs=['prompt_embeds']
|
||||
)
|
||||
|
||||
out.images[0].save("out_custom_cfg.png")
|
||||
```
|
||||
|
||||
## Interrupt the diffusion process
|
||||
|
||||
> [!TIP]
|
||||
> The interruption callback is supported for text-to-image, image-to-image, and inpainting for the [StableDiffusionPipeline](../api/pipelines/stable_diffusion/overview) and [StableDiffusionXLPipeline](../api/pipelines/stable_diffusion/stable_diffusion_xl).
|
||||
|
||||
Stopping the diffusion process early is useful when building UIs that work with Diffusers because it allows users to stop the generation process if they're unhappy with the intermediate results. You can incorporate this into your pipeline with a callback.
|
||||
|
||||
This callback function should take the following arguments: `pipeline`, `i`, `t`, and `callback_kwargs` (this must be returned). Set the pipeline's `_interrupt` attribute to `True` to stop the diffusion process after a certain number of steps. You are also free to implement your own custom stopping logic inside the callback.
|
||||
|
||||
In this example, the diffusion process is stopped after 10 steps even though `num_inference_steps` is set to 50.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipeline = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5")
|
||||
pipeline.enable_model_cpu_offload()
|
||||
num_inference_steps = 50
|
||||
from diffusers import StableDiffusionXLPipeline
|
||||
|
||||
def interrupt_callback(pipeline, i, t, callback_kwargs):
|
||||
stop_idx = 10
|
||||
@@ -150,6 +68,11 @@ def interrupt_callback(pipeline, i, t, callback_kwargs):
|
||||
|
||||
return callback_kwargs
|
||||
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5"
|
||||
)
|
||||
num_inference_steps = 50
|
||||
|
||||
pipeline(
|
||||
"A photo of a cat",
|
||||
num_inference_steps=num_inference_steps,
|
||||
@@ -157,92 +80,11 @@ pipeline(
|
||||
)
|
||||
```
|
||||
|
||||
## IP Adapter Cutoff
|
||||
## Display intermediate images
|
||||
|
||||
IP Adapter is an image prompt adapter that can be used for diffusion models without any changes to the underlying model. We can use the IP Adapter Cutoff Callback to disable the IP Adapter after a certain number of steps. To set up the callback, you need to specify the number of denoising steps after which the callback comes into effect. You can do so by using either one of these two arguments:
|
||||
Visualizing the intermediate images is useful for progress monitoring and assessing the quality of the generated content. This callback decodes the latent tensors at each step and converts them to images.
|
||||
|
||||
- `cutoff_step_ratio`: Float number with the ratio of the steps.
|
||||
- `cutoff_step_index`: Integer number with the exact number of the step.
|
||||
|
||||
We need to download the diffusion model and load the ip_adapter for it as follows:
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16).to("cuda")
|
||||
pipeline.load_ip_adapter("h94/IP-Adapter", subfolder="sdxl_models", weight_name="ip-adapter_sdxl.bin")
|
||||
pipeline.set_ip_adapter_scale(0.6)
|
||||
```
|
||||
The setup for the callback should look something like this:
|
||||
|
||||
```py
|
||||
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
from diffusers.callbacks import IPAdapterScaleCutoffCallback
|
||||
from diffusers.utils import load_image
|
||||
import torch
|
||||
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
|
||||
pipeline.load_ip_adapter(
|
||||
"h94/IP-Adapter",
|
||||
subfolder="sdxl_models",
|
||||
weight_name="ip-adapter_sdxl.bin"
|
||||
)
|
||||
|
||||
pipeline.set_ip_adapter_scale(0.6)
|
||||
|
||||
|
||||
callback = IPAdapterScaleCutoffCallback(
|
||||
cutoff_step_ratio=None,
|
||||
cutoff_step_index=5
|
||||
)
|
||||
|
||||
image = load_image(
|
||||
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ip_adapter_diner.png"
|
||||
)
|
||||
|
||||
generator = torch.Generator(device="cuda").manual_seed(2628670641)
|
||||
|
||||
images = pipeline(
|
||||
prompt="a tiger sitting in a chair drinking orange juice",
|
||||
ip_adapter_image=image,
|
||||
negative_prompt="deformed, ugly, wrong proportion, low res, bad anatomy, worst quality, low quality",
|
||||
generator=generator,
|
||||
num_inference_steps=50,
|
||||
callback_on_step_end=callback,
|
||||
).images
|
||||
|
||||
images[0].save("custom_callback_img.png")
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/without_callback.png" alt="generated image of a tiger sitting in a chair drinking orange juice" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">without IPAdapterScaleCutoffCallback</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/with_callback2.png" alt="generated image of a tiger sitting in a chair drinking orange juice with ip adapter callback" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">with IPAdapterScaleCutoffCallback</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
|
||||
## Display image after each generation step
|
||||
|
||||
> [!TIP]
|
||||
> This tip was contributed by [asomoza](https://github.com/asomoza).
|
||||
|
||||
Display an image after each generation step by accessing and converting the latents after each step into an image. The latent space is compressed to 128x128, so the images are also 128x128 which is useful for a quick preview.
|
||||
|
||||
1. Use the function below to convert the SDXL latents (4 channels) to RGB tensors (3 channels) as explained in the [Explaining the SDXL latent space](https://huggingface.co/blog/TimothyAlexisVass/explaining-the-sdxl-latent-space) blog post.
|
||||
[Convert](https://huggingface.co/blog/TimothyAlexisVass/explaining-the-sdxl-latent-space) the Stable Diffusion XL latents (4 channels) to RGB tensors (3 channels).
|
||||
|
||||
```py
|
||||
def latents_to_rgb(latents):
|
||||
@@ -260,7 +102,7 @@ def latents_to_rgb(latents):
|
||||
return Image.fromarray(image_array)
|
||||
```
|
||||
|
||||
2. Create a function to decode and save the latents into an image.
|
||||
Extract the latents and convert the first image in the batch to RGB. Save the image as a PNG file with the step number.
|
||||
|
||||
```py
|
||||
def decode_tensors(pipe, step, timestep, callback_kwargs):
|
||||
@@ -272,19 +114,18 @@ def decode_tensors(pipe, step, timestep, callback_kwargs):
|
||||
return callback_kwargs
|
||||
```
|
||||
|
||||
3. Pass the `decode_tensors` function to the `callback_on_step_end` parameter to decode the tensors after each step. You also need to specify what you want to modify in the `callback_on_step_end_tensor_inputs` parameter, which in this case are the latents.
|
||||
Use the `callback_on_step_end_tensor_inputs` parameter to specify what input type to modify, which in this case, are the latents.
|
||||
|
||||
```py
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
import torch
|
||||
from PIL import Image
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
|
||||
pipeline = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True
|
||||
).to("cuda")
|
||||
device_map="cuda"
|
||||
)
|
||||
|
||||
image = pipeline(
|
||||
prompt="A croissant shaped like a cute bear.",
|
||||
@@ -293,27 +134,3 @@ image = pipeline(
|
||||
callback_on_step_end_tensor_inputs=["latents"],
|
||||
).images[0]
|
||||
```
|
||||
|
||||
<div class="flex gap-4 justify-center">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_0.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 0</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_19.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 19
|
||||
</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_29.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 29</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_39.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 39</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/tips_step_49.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">step 49</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -10,376 +10,163 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Load community pipelines and components
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
## Community pipelines
|
||||
# Community pipelines and components
|
||||
|
||||
> [!TIP] Take a look at GitHub Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down.
|
||||
|
||||
Community pipelines are [`DiffusionPipeline`] classes that are different from the original paper implementation (for example, the [`StableDiffusionControlNetPipeline`] corresponds to the [Text-to-Image Generation with ControlNet Conditioning](https://huggingface.co/papers/2302.05543) paper). They provide additional functionality or extend the original implementation of a pipeline.
|
||||
|
||||
There are many cool community pipelines like [Marigold Depth Estimation](https://github.com/huggingface/diffusers/tree/main/examples/community#marigold-depth-estimation) or [InstantID](https://github.com/huggingface/diffusers/tree/main/examples/community#instantid-pipeline), and you can find all the official community pipelines [here](https://github.com/huggingface/diffusers/tree/main/examples/community).
|
||||
|
||||
There are two types of community pipelines, those stored on the Hugging Face Hub and those stored on the Diffusers GitHub repository. Hub pipelines are completely customizable (scheduler, models, pipeline code, etc.) while Diffusers GitHub pipelines are limited to custom pipeline code.
|
||||
|
||||
| | GitHub community pipeline | HF Hub community pipeline |
|
||||
|----------------|------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------|
|
||||
| usage | same | same |
|
||||
| review process | open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging; may be slower | upload directly to a Hub repository without any review; this is the fastest workflow |
|
||||
| visibility | included in the official Diffusers repository and documentation | included on your HF Hub profile and relies on your own usage/promotion to gain visibility |
|
||||
|
||||
<hfoptions id="community">
|
||||
<hfoption id="Hub pipelines">
|
||||
|
||||
To load a Hugging Face Hub community pipeline, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you'd like to load the pipeline weights and components from. For example, the example below loads a dummy pipeline from [hf-internal-testing/diffusers-dummy-pipeline](https://huggingface.co/hf-internal-testing/diffusers-dummy-pipeline/blob/main/pipeline.py) and the pipeline weights and components from [google/ddpm-cifar10-32](https://huggingface.co/google/ddpm-cifar10-32):
|
||||
|
||||
> [!WARNING]
|
||||
> By loading a community pipeline from the Hugging Face Hub, you are trusting that the code you are loading is safe. Make sure to inspect the code online before loading and running it automatically!
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"google/ddpm-cifar10-32", custom_pipeline="hf-internal-testing/diffusers-dummy-pipeline", use_safetensors=True
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="GitHub pipelines">
|
||||
|
||||
To load a GitHub community pipeline, pass the repository id of the community pipeline to the `custom_pipeline` argument and the model repository where you'd like to load the pipeline weights and components from. You can also load model components directly. The example below loads the community [CLIP Guided Stable Diffusion](https://github.com/huggingface/diffusers/tree/main/examples/community#clip-guided-stable-diffusion) pipeline and the CLIP model components.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
from transformers import CLIPImageProcessor, CLIPModel
|
||||
|
||||
clip_model_id = "laion/CLIP-ViT-B-32-laion2B-s34B-b79K"
|
||||
|
||||
feature_extractor = CLIPImageProcessor.from_pretrained(clip_model_id)
|
||||
clip_model = CLIPModel.from_pretrained(clip_model_id)
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
custom_pipeline="clip_guided_stable_diffusion",
|
||||
clip_model=clip_model,
|
||||
feature_extractor=feature_extractor,
|
||||
use_safetensors=True,
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Load from a local file
|
||||
|
||||
Community pipelines can also be loaded from a local file if you pass a file path instead. The directory at that path must contain a `pipeline.py` file that contains the pipeline class.
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
custom_pipeline="./path/to/pipeline_directory/",
|
||||
clip_model=clip_model,
|
||||
feature_extractor=feature_extractor,
|
||||
use_safetensors=True,
|
||||
)
|
||||
```
|
||||
|
||||
### Load from a specific version
|
||||
|
||||
By default, community pipelines are loaded from the latest stable version of Diffusers. To load a community pipeline from another version, use the `custom_revision` parameter.
|
||||
|
||||
<hfoptions id="version">
|
||||
<hfoption id="main">
|
||||
|
||||
For example, to load from the main branch:
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
custom_pipeline="clip_guided_stable_diffusion",
|
||||
custom_revision="main",
|
||||
clip_model=clip_model,
|
||||
feature_extractor=feature_extractor,
|
||||
use_safetensors=True,
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="older version">
|
||||
|
||||
For example, to load from a previous version of Diffusers like v0.25.0:
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
custom_pipeline="clip_guided_stable_diffusion",
|
||||
custom_revision="v0.25.0",
|
||||
clip_model=clip_model,
|
||||
feature_extractor=feature_extractor,
|
||||
use_safetensors=True,
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Load with from_pipe
|
||||
|
||||
Community pipelines can also be loaded with the [`~DiffusionPipeline.from_pipe`] method which allows you to load and reuse multiple pipelines without any additional memory overhead (learn more in the [Reuse a pipeline](./loading#reuse-a-pipeline) guide). The memory requirement is determined by the largest single pipeline loaded.
|
||||
|
||||
For example, let's load a community pipeline that supports [long prompts with weighting](https://github.com/huggingface/diffusers/tree/main/examples/community#long-prompt-weighting-stable-diffusion) from a Stable Diffusion pipeline.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipe_sd = DiffusionPipeline.from_pretrained("emilianJR/CyberRealistic_V3", torch_dtype=torch.float16)
|
||||
pipe_sd.to("cuda")
|
||||
# load long prompt weighting pipeline
|
||||
pipe_lpw = DiffusionPipeline.from_pipe(
|
||||
pipe_sd,
|
||||
custom_pipeline="lpw_stable_diffusion",
|
||||
).to("cuda")
|
||||
|
||||
prompt = "cat, hiding in the leaves, ((rain)), zazie rainyday, beautiful eyes, macro shot, colorful details, natural lighting, amazing composition, subsurface scattering, amazing textures, filmic, soft light, ultra-detailed eyes, intricate details, detailed texture, light source contrast, dramatic shadows, cinematic light, depth of field, film grain, noise, dark background, hyperrealistic dslr film still, dim volumetric cinematic lighting"
|
||||
neg_prompt = "(deformed iris, deformed pupils, semi-realistic, cgi, 3d, render, sketch, cartoon, drawing, anime, mutated hands and fingers:1.4), (deformed, distorted, disfigured:1.3), poorly drawn, bad anatomy, wrong anatomy, extra limb, missing limb, floating limbs, disconnected limbs, mutation, mutated, ugly, disgusting, amputation"
|
||||
generator = torch.Generator(device="cpu").manual_seed(20)
|
||||
out_lpw = pipe_lpw(
|
||||
prompt,
|
||||
negative_prompt=neg_prompt,
|
||||
width=512,
|
||||
height=512,
|
||||
max_embeddings_multiples=3,
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
out_lpw
|
||||
```
|
||||
|
||||
<div class="flex gap-4">
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/from_pipe_lpw.png" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">Stable Diffusion with long prompt weighting</figcaption>
|
||||
</div>
|
||||
<div>
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/from_pipe_non_lpw.png" />
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">Stable Diffusion</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
## Example community pipelines
|
||||
|
||||
Community pipelines are a really fun and creative way to extend the capabilities of the original pipeline with new and unique features. You can find all community pipelines in the [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) folder with inference and training examples for how to use them.
|
||||
|
||||
This section showcases a couple of the community pipelines and hopefully it'll inspire you to create your own (feel free to open a PR for your community pipeline and ping us for a review)!
|
||||
Community pipelines are [`DiffusionPipeline`] classes that are different from the original paper implementation. They provide additional functionality or extend the original pipeline implementation.
|
||||
|
||||
> [!TIP]
|
||||
> The [`~DiffusionPipeline.from_pipe`] method is particularly useful for loading community pipelines because many of them don't have pretrained weights and add a feature on top of an existing pipeline like Stable Diffusion or Stable Diffusion XL. You can learn more about the [`~DiffusionPipeline.from_pipe`] method in the [Load with from_pipe](custom_pipeline_overview#load-with-from_pipe) section.
|
||||
> Check out the community pipelines in [diffusers/examples/community](https://github.com/huggingface/diffusers/tree/main/examples/community) with inference and training examples for how to use them.
|
||||
|
||||
<hfoptions id="community">
|
||||
<hfoption id="Marigold">
|
||||
Community pipelines are either stored on the Hub or the Diffusers' GitHub repository. Hub pipelines are completely customizable (scheduler, models, pipeline code, etc.) while GitHub pipelines are limited to only the custom pipeline code. Further compare the two community pipeline types in the table below.
|
||||
|
||||
[Marigold](https://marigoldmonodepth.github.io/) is a depth estimation diffusion pipeline that uses the rich existing and inherent visual knowledge in diffusion models. It takes an input image and denoises and decodes it into a depth map. Marigold performs well even on images it hasn't seen before.
|
||||
| | GitHub | Hub |
|
||||
|---|---|---|
|
||||
| Usage | Same. | Same. |
|
||||
| Review process | Open a Pull Request on GitHub and undergo a review process from the Diffusers team before merging. This option is slower. | Upload directly to a Hub repository without a review. This is the fastest option. |
|
||||
| Visibility | Included in the official Diffusers repository and docs. | Included on your Hub profile and relies on your own usage and promotion to gain visibility. |
|
||||
|
||||
## custom_pipeline
|
||||
|
||||
Load either community pipeline type by passing the `custom_pipeline` argument to [`~DiffusionPipeline.from_pretrained`].
|
||||
|
||||
```py
|
||||
import torch
|
||||
from PIL import Image
|
||||
from diffusers import DiffusionPipeline
|
||||
from diffusers.utils import load_image
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"prs-eth/marigold-lcm-v1-0",
|
||||
custom_pipeline="marigold_depth_estimation",
|
||||
"stabilityai/stable-diffusion-3-medium-diffusers",
|
||||
custom_pipeline="pipeline_stable_diffusion_3_instruct_pix2pix",
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
device_map="cuda"
|
||||
)
|
||||
|
||||
pipeline.to("cuda")
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/community-marigold.png")
|
||||
output = pipeline(
|
||||
image,
|
||||
denoising_steps=4,
|
||||
ensemble_size=5,
|
||||
processing_res=768,
|
||||
match_input_res=True,
|
||||
batch_size=0,
|
||||
seed=33,
|
||||
color_map="Spectral",
|
||||
show_progress_bar=True,
|
||||
)
|
||||
depth_colored: Image.Image = output.depth_colored
|
||||
depth_colored.save("./depth_colored.png")
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/community-marigold.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/marigold-depth.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">colorized depth image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="HD-Painter">
|
||||
|
||||
[HD-Painter](https://hf.co/papers/2312.14091) is a high-resolution inpainting pipeline. It introduces a *Prompt-Aware Introverted Attention (PAIntA)* layer to better align a prompt with the area to be inpainted, and *Reweighting Attention Score Guidance (RASG)* to keep the latents more prompt-aligned and within their trained domain to generate realistic images.
|
||||
Add the `custom_revision` argument to [`~DiffusionPipeline.from_pretrained`] to load a community pipeline from a specific version (for example, `v0.30.0` or `main`). By default, community pipelines are loaded from the latest stable version of Diffusers.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline, DDIMScheduler
|
||||
from diffusers.utils import load_image
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5-inpainting",
|
||||
custom_pipeline="hd_painter"
|
||||
"stabilityai/stable-diffusion-3-medium-diffusers",
|
||||
custom_pipeline="pipeline_stable_diffusion_3_instruct_pix2pix",
|
||||
custom_revision="main",
|
||||
torch_dtype=torch.float16,
|
||||
device_map="cuda"
|
||||
)
|
||||
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter.jpg")
|
||||
mask_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter-mask.png")
|
||||
prompt = "football"
|
||||
image = pipeline(prompt, init_image, mask_image, use_rasg=True, use_painta=True, generator=torch.manual_seed(0)).images[0]
|
||||
image
|
||||
```
|
||||
|
||||
<div class="flex flex-row gap-4">
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter.jpg"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">original image</figcaption>
|
||||
</div>
|
||||
<div class="flex-1">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/hd-painter-output.png"/>
|
||||
<figcaption class="mt-2 text-center text-sm text-gray-500">generated image</figcaption>
|
||||
</div>
|
||||
</div>
|
||||
> [!WARNING]
|
||||
> While the Hugging Face Hub [scans](https://huggingface.co/docs/hub/security-malware) files, you should still inspect the Hub pipeline code and make sure it is safe.
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
There are a few ways to load a community pipeline.
|
||||
|
||||
- Pass a path to `custom_pipeline` to load a local community pipeline. The directory must contain a `pipeline.py` file containing the pipeline class.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-3-medium-diffusers",
|
||||
custom_pipeline="path/to/pipeline_directory",
|
||||
torch_dtype=torch.float16,
|
||||
device_map="cuda"
|
||||
)
|
||||
```
|
||||
|
||||
- The `custom_pipeline` argument is also supported by [`~DiffusionPipeline.from_pipe`], which is useful for [reusing pipelines](./loading#reuse-a-pipeline) without using additional memory. It limits the memory usage to only the largest pipeline loaded.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline_sd = DiffusionPipeline.from_pretrained("emilianJR/CyberRealistic_V3", torch_dtype=torch.float16, device_map="cuda")
|
||||
pipeline_lpw = DiffusionPipeline.from_pipe(
|
||||
pipeline_sd, custom_pipeline="lpw_stable_diffusion", device_map="cuda"
|
||||
)
|
||||
```
|
||||
|
||||
The [`~DiffusionPipeline.from_pipe`] method is especially useful for loading community pipelines because many of them don't have pretrained weights. Community pipelines generally add a feature on top of an existing pipeline.
|
||||
|
||||
## Community components
|
||||
|
||||
Community components allow users to build pipelines that may have customized components that are not a part of Diffusers. If your pipeline has custom components that Diffusers doesn't already support, you need to provide their implementations as Python modules. These customized components could be a VAE, UNet, and scheduler. In most cases, the text encoder is imported from the Transformers library. The pipeline code itself can also be customized.
|
||||
Community components let users build pipelines with custom transformers, UNets, VAEs, and schedulers not supported by Diffusers. These components require Python module implementations.
|
||||
|
||||
This section shows how users should use community components to build a community pipeline.
|
||||
This section shows how users can use community components to build a community pipeline using [showlab/show-1-base](https://huggingface.co/showlab/show-1-base) as an example.
|
||||
|
||||
You'll use the [showlab/show-1-base](https://huggingface.co/showlab/show-1-base) pipeline checkpoint as an example.
|
||||
|
||||
1. Import and load the text encoder from Transformers:
|
||||
|
||||
```python
|
||||
from transformers import T5Tokenizer, T5EncoderModel
|
||||
|
||||
pipe_id = "showlab/show-1-base"
|
||||
tokenizer = T5Tokenizer.from_pretrained(pipe_id, subfolder="tokenizer")
|
||||
text_encoder = T5EncoderModel.from_pretrained(pipe_id, subfolder="text_encoder")
|
||||
```
|
||||
|
||||
2. Load a scheduler:
|
||||
1. Load the required components, the scheduler and image processor. The text encoder is generally imported from [Transformers](https://huggingface.co/docs/transformers/index).
|
||||
|
||||
```python
|
||||
from transformers import T5Tokenizer, T5EncoderModel, CLIPImageProcessor
|
||||
from diffusers import DPMSolverMultistepScheduler
|
||||
|
||||
pipeline_id = "showlab/show-1-base"
|
||||
tokenizer = T5Tokenizer.from_pretrained(pipeline_id, subfolder="tokenizer")
|
||||
text_encoder = T5EncoderModel.from_pretrained(pipeline_id, subfolder="text_encoder")
|
||||
scheduler = DPMSolverMultistepScheduler.from_pretrained(pipe_id, subfolder="scheduler")
|
||||
```
|
||||
|
||||
3. Load an image processor:
|
||||
|
||||
```python
|
||||
from transformers import CLIPImageProcessor
|
||||
|
||||
feature_extractor = CLIPImageProcessor.from_pretrained(pipe_id, subfolder="feature_extractor")
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
> [!WARNING]
|
||||
> In steps 2 and 3, the custom [UNet](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py) and [pipeline](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) implementation must match the format shown in their files for this example to work.
|
||||
|
||||
In steps 4 and 5, the custom [UNet](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py) and [pipeline](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) implementation must match the format shown in their files for this example to work.
|
||||
|
||||
</Tip>
|
||||
|
||||
4. Now you'll load a [custom UNet](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py), which in this example, has already been implemented in [showone_unet_3d_condition.py](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) for your convenience. You'll notice the [`UNet3DConditionModel`] class name is changed to `ShowOneUNet3DConditionModel` because [`UNet3DConditionModel`] already exists in Diffusers. Any components needed for the `ShowOneUNet3DConditionModel` class should be placed in showone_unet_3d_condition.py.
|
||||
|
||||
Once this is done, you can initialize the UNet:
|
||||
|
||||
```python
|
||||
from showone_unet_3d_condition import ShowOneUNet3DConditionModel
|
||||
|
||||
unet = ShowOneUNet3DConditionModel.from_pretrained(pipe_id, subfolder="unet")
|
||||
```
|
||||
|
||||
5. Finally, you'll load the custom pipeline code. For this example, it has already been created for you in [pipeline_t2v_base_pixel.py](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/pipeline_t2v_base_pixel.py). This script contains a custom `TextToVideoIFPipeline` class for generating videos from text. Just like the custom UNet, any code needed for the custom pipeline to work should go in pipeline_t2v_base_pixel.py.
|
||||
|
||||
Once everything is in place, you can initialize the `TextToVideoIFPipeline` with the `ShowOneUNet3DConditionModel`:
|
||||
2. Load a [custom UNet](https://github.com/showlab/Show-1/blob/main/showone/models/unet_3d_condition.py) which is already implemented in [showone_unet_3d_condition.py](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py). The [`UNet3DConditionModel`] class name is renamed to the custom implementation, `ShowOneUNet3DConditionModel`, because [`UNet3DConditionModel`] already exists in Diffusers. Any components required for the `ShowOneUNet3DConditionModel` class should be placed in `showone_unet_3d_condition.py`.
|
||||
|
||||
```python
|
||||
from showone_unet_3d_condition import ShowOneUNet3DConditionModel
|
||||
|
||||
unet = ShowOneUNet3DConditionModel.from_pretrained(pipeline_id, subfolder="unet")
|
||||
```
|
||||
|
||||
3. Load the custom pipeline code (already implemented in [pipeline_t2v_base_pixel.py](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/pipeline_t2v_base_pixel.py)). This script contains a custom `TextToVideoIFPipeline` class for generating videos from text. Like the custom UNet, any code required for `TextToVideoIFPipeline` should be placed in `pipeline_t2v_base_pixel.py`.
|
||||
|
||||
Initialize `TextToVideoIFPipeline` with `ShowOneUNet3DConditionModel`.
|
||||
|
||||
```python
|
||||
from pipeline_t2v_base_pixel import TextToVideoIFPipeline
|
||||
import torch
|
||||
from pipeline_t2v_base_pixel import TextToVideoIFPipeline
|
||||
|
||||
pipeline = TextToVideoIFPipeline(
|
||||
unet=unet,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
scheduler=scheduler,
|
||||
feature_extractor=feature_extractor
|
||||
feature_extractor=feature_extractor,
|
||||
device_map="cuda",
|
||||
torch_dtype=torch.float16
|
||||
)
|
||||
pipeline = pipeline.to(device="cuda")
|
||||
pipeline.torch_dtype = torch.float16
|
||||
```
|
||||
|
||||
Push the pipeline to the Hub to share with the community!
|
||||
4. Push the pipeline to the Hub to share with the community.
|
||||
|
||||
```python
|
||||
pipeline.push_to_hub("custom-t2v-pipeline")
|
||||
```
|
||||
|
||||
After the pipeline is successfully pushed, you need to make a few changes:
|
||||
After the pipeline is successfully pushed, make the following changes.
|
||||
|
||||
1. Change the `_class_name` attribute in [model_index.json](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/model_index.json#L2) to `"pipeline_t2v_base_pixel"` and `"TextToVideoIFPipeline"`.
|
||||
2. Upload `showone_unet_3d_condition.py` to the [unet](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) subfolder.
|
||||
3. Upload `pipeline_t2v_base_pixel.py` to the pipeline [repository](https://huggingface.co/sayakpaul/show-1-base-with-code/tree/main).
|
||||
- Change the `_class_name` attribute in [model_index.json](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/model_index.json#L2) to `"pipeline_t2v_base_pixel"` and `"TextToVideoIFPipeline"`.
|
||||
- Upload `showone_unet_3d_condition.py` to the [unet](https://huggingface.co/sayakpaul/show-1-base-with-code/blob/main/unet/showone_unet_3d_condition.py) subfolder.
|
||||
- Upload `pipeline_t2v_base_pixel.py` to the pipeline [repository](https://huggingface.co/sayakpaul/show-1-base-with-code/tree/main).
|
||||
|
||||
To run inference, add the `trust_remote_code` argument while initializing the pipeline to handle all the "magic" behind the scenes.
|
||||
|
||||
> [!WARNING]
|
||||
> As an additional precaution with `trust_remote_code=True`, we strongly encourage you to pass a commit hash to the `revision` parameter in [`~DiffusionPipeline.from_pretrained`] to make sure the code hasn't been updated with some malicious new lines of code (unless you fully trust the model owners).
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"<change-username>/<change-id>", trust_remote_code=True, torch_dtype=torch.float16
|
||||
).to("cuda")
|
||||
|
||||
prompt = "hello"
|
||||
|
||||
# Text embeds
|
||||
prompt_embeds, negative_embeds = pipeline.encode_prompt(prompt)
|
||||
|
||||
# Keyframes generation (8x64x40, 2fps)
|
||||
video_frames = pipeline(
|
||||
prompt_embeds=prompt_embeds,
|
||||
negative_prompt_embeds=negative_embeds,
|
||||
num_frames=8,
|
||||
height=40,
|
||||
width=64,
|
||||
num_inference_steps=2,
|
||||
guidance_scale=9.0,
|
||||
output_type="pt"
|
||||
).frames
|
||||
```
|
||||
|
||||
As an additional reference, take a look at the repository structure of [stabilityai/japanese-stable-diffusion-xl](https://huggingface.co/stabilityai/japanese-stable-diffusion-xl/) which also uses the `trust_remote_code` feature.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/japanese-stable-diffusion-xl", trust_remote_code=True
|
||||
)
|
||||
pipeline.to("cuda")
|
||||
```
|
||||
|
||||
> [!WARNING]
|
||||
> As an additional precaution with `trust_remote_code=True`, we strongly encourage passing a commit hash to the `revision` argument in [`~DiffusionPipeline.from_pretrained`] to make sure the code hasn't been updated with new malicious code (unless you fully trust the model owners).
|
||||
|
||||
## Resources
|
||||
|
||||
- Take a look at Issue [#841](https://github.com/huggingface/diffusers/issues/841) for more context about why we're adding community pipelines to help everyone easily share their work without being slowed down.
|
||||
- Check out the [stabilityai/japanese-stable-diffusion-xl](https://huggingface.co/stabilityai/japanese-stable-diffusion-xl/) repository for an additional example of a community pipeline that also uses the `trust_remote_code` feature.
|
||||
@@ -10,116 +10,166 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Load pipelines
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
Diffusion systems consist of multiple components like parameterized models and schedulers that interact in complex ways. That is why we designed the [`DiffusionPipeline`] to wrap the complexity of the entire diffusion system into an easy-to-use API. At the same time, the [`DiffusionPipeline`] is entirely customizable so you can modify each component to build a diffusion system for your use case.
|
||||
# DiffusionPipeline
|
||||
|
||||
This guide will show you how to load:
|
||||
Diffusion models consist of multiple components like UNets or diffusion transformers (DiTs), text encoders, variational autoencoders (VAEs), and schedulers. The [`DiffusionPipeline`] wraps all of these components into a single easy-to-use API without giving up the flexibility to modify its components.
|
||||
|
||||
- pipelines from the Hub and locally
|
||||
- different components into a pipeline
|
||||
- multiple pipelines without increasing memory usage
|
||||
- checkpoint variants such as different floating point types or non-exponential mean averaged (EMA) weights
|
||||
This guide will show you how to load a [`DiffusionPipeline`].
|
||||
|
||||
## Load a pipeline
|
||||
## Loading a pipeline
|
||||
|
||||
> [!TIP]
|
||||
> Skip to the [DiffusionPipeline explained](#diffusionpipeline-explained) section if you're interested in an explanation about how the [`DiffusionPipeline`] class works.
|
||||
[`DiffusionPipeline`] is a base pipeline class that automatically selects and returns an instance of a model's pipeline subclass, like [`QwenImagePipeline`], by scanning the `model_index.json` file for the class name.
|
||||
|
||||
There are two ways to load a pipeline for a task:
|
||||
|
||||
1. Load the generic [`DiffusionPipeline`] class and allow it to automatically detect the correct pipeline class from the checkpoint.
|
||||
2. Load a specific pipeline class for a specific task.
|
||||
|
||||
<hfoptions id="pipelines">
|
||||
<hfoption id="generic pipeline">
|
||||
|
||||
The [`DiffusionPipeline`] class is a simple and generic way to load the latest trending diffusion model from the [Hub](https://huggingface.co/models?library=diffusers&sort=trending). It uses the [`~DiffusionPipeline.from_pretrained`] method to automatically detect the correct pipeline class for a task from the checkpoint, downloads and caches all the required configuration and weight files, and returns a pipeline ready for inference.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", use_safetensors=True)
|
||||
```
|
||||
|
||||
This same checkpoint can also be used for an image-to-image task. The [`DiffusionPipeline`] class can handle any task as long as you provide the appropriate inputs. For example, for an image-to-image task, you need to pass an initial image to the pipeline.
|
||||
Pass a model id to [`~DiffusionPipeline.from_pretrained`] to load a pipeline.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", use_safetensors=True)
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"Qwen/Qwen-Image", torch_dtype=torch.bfloat16, device_map="cuda"
|
||||
)
|
||||
```
|
||||
|
||||
init_image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/img2img-init.png")
|
||||
prompt = "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
|
||||
image = pipeline("Astronaut in a jungle, cold color palette, muted colors, detailed, 8k", image=init_image).images[0]
|
||||
Every model has a specific pipeline subclass that inherits from [`DiffusionPipeline`]. A subclass usually has a narrow focus and is task-specific. See the table below for an example.
|
||||
|
||||
| pipeline subclass | task |
|
||||
|---|---|
|
||||
| [`QwenImagePipeline`] | text-to-image |
|
||||
| [`QwenImageImg2ImgPipeline`] | image-to-image |
|
||||
| [`QwenImageInpaintPipeline`] | inpaint |
|
||||
|
||||
You could use the subclass directly by passing a model id to [`~QwenImagePipeline.from_pretrained`].
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import QwenImagePipeline
|
||||
|
||||
pipeline = QwenImagePipeline.from_pretrained(
|
||||
"Qwen/Qwen-Image", torch_dtype=torch.bfloat16, device_map="cuda"
|
||||
)
|
||||
```
|
||||
|
||||
### Local pipelines
|
||||
|
||||
Pipelines can also be run locally. Use [`~huggingface_hub.snapshot_download`] to download a model repository.
|
||||
|
||||
```py
|
||||
from huggingface_hub import snapshot_download
|
||||
|
||||
snapshot_download(repo_id="Qwen/Qwen-Image")
|
||||
```
|
||||
|
||||
The model is downloaded to your [cache](../installation#cache). Pass the folder path to [`~QwenImagePipeline.from_pretrained`] to load it.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import QwenImagePipeline
|
||||
|
||||
pipeline = QwenImagePipeline.from_pretrained(
|
||||
"path/to/your/cache", torch_dtype=torch.bfloat16, device_map="cuda"
|
||||
)
|
||||
```
|
||||
|
||||
The [`~QwenImagePipeline.from_pretrained`] method won't download files from the Hub when it detects a local path. But this also means it won't download and cache any updates that have been made to the model either.
|
||||
|
||||
## Pipeline data types
|
||||
|
||||
Use the `torch_dtype` argument in [`~DiffusionPipeline.from_pretrained`] to load a model with a specific data type. This allows you to load different models in different precisions. For example, loading a large transformer model in half-precision reduces the memory required.
|
||||
|
||||
Pass the data type for each model as a dictionary to `torch_dtype`. Use the `default` key to set the default data type. If a model isn't in the dictionary and `default` isn't provided, it is loaded in full precision (`torch.float32`).
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import QwenImagePipeline
|
||||
|
||||
pipeline = QwenImagePipeline.from_pretrained(
|
||||
"Qwen/Qwen-Image",
|
||||
torch_dtype={"transformer": torch.bfloat16, "default": torch.float16},
|
||||
)
|
||||
print(pipeline.transformer.dtype, pipeline.vae.dtype)
|
||||
```
|
||||
|
||||
You don't need to use a dictionary if you're loading all the models in the same data type.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import QwenImagePipeline
|
||||
|
||||
pipeline = QwenImagePipeline.from_pretrained(
|
||||
"Qwen/Qwen-Image", torch_dtype=torch.bfloat16
|
||||
)
|
||||
print(pipeline.transformer.dtype, pipeline.vae.dtype)
|
||||
```
|
||||
|
||||
## Device placement
|
||||
|
||||
The `device_map` argument determines individual model or pipeline placement on an accelerator like a GPU. It is especially helpful when there are multiple GPUs.
|
||||
|
||||
Diffusers currently provides three options for `device_map`: `"cuda"`, `"balanced"`, and `"auto"`. Refer to the table below to compare the three placement strategies.
|
||||
|
||||
| parameter | description |
|
||||
|---|---|
|
||||
| `"cuda"` | places model or pipeline on CUDA device |
|
||||
| `"balanced"` | evenly distributes model or pipeline on all GPUs |
|
||||
| `"auto"` | distributes model from fastest device first to slowest |
|
||||
|
||||
Use the `max_memory` argument in [`~DiffusionPipeline.from_pretrained`] to allocate a maximum amount of memory to use on each device. By default, Diffusers uses the maximum amount available.
|
||||
|
||||
<hfoptions id="device_map">
|
||||
<hfoption id="pipeline">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"Qwen/Qwen-Image",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="cuda",
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="specific pipeline">
|
||||
|
||||
Checkpoints can be loaded by their specific pipeline class if you already know it. For example, to load a Stable Diffusion model, use the [`StableDiffusionPipeline`] class.
|
||||
|
||||
```python
|
||||
from diffusers import StableDiffusionPipeline
|
||||
|
||||
pipeline = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", use_safetensors=True)
|
||||
```
|
||||
|
||||
This same checkpoint may also be used for another task like image-to-image. To differentiate what task you want to use the checkpoint for, you have to use the corresponding task-specific pipeline class. For example, to use the same checkpoint for image-to-image, use the [`StableDiffusionImg2ImgPipeline`] class.
|
||||
<hfoption id="individual model">
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionImg2ImgPipeline
|
||||
import torch
|
||||
from diffusers import AutoModel
|
||||
|
||||
pipeline = StableDiffusionImg2ImgPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", use_safetensors=True)
|
||||
max_memory = {0: "16GB", 1: "16GB"}
|
||||
transformer = AutoModel.from_pretrained(
|
||||
"Qwen/Qwen-Image",
|
||||
subfolder="transformer",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="cuda",
|
||||
max_memory=max_memory
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Use the Space below to gauge a pipeline's memory requirements before you download and load it to see if it runs on your hardware.
|
||||
The `hf_device_map` attribute allows you to access and view the `device_map`.
|
||||
|
||||
<div class="block dark:hidden">
|
||||
<iframe
|
||||
src="https://diffusers-compute-pipeline-size.hf.space?__theme=light"
|
||||
width="850"
|
||||
height="1600"
|
||||
></iframe>
|
||||
</div>
|
||||
<div class="hidden dark:block">
|
||||
<iframe
|
||||
src="https://diffusers-compute-pipeline-size.hf.space?__theme=dark"
|
||||
width="850"
|
||||
height="1600"
|
||||
></iframe>
|
||||
</div>
|
||||
|
||||
### Specifying Component-Specific Data Types
|
||||
|
||||
You can customize the data types for individual sub-models by passing a dictionary to the `torch_dtype` parameter. This allows you to load different components of a pipeline in different floating point precisions. For instance, if you want to load the transformer with `torch.bfloat16` and all other components with `torch.float16`, you can pass a dictionary mapping:
|
||||
|
||||
```python
|
||||
from diffusers import HunyuanVideoPipeline
|
||||
import torch
|
||||
|
||||
pipe = HunyuanVideoPipeline.from_pretrained(
|
||||
"hunyuanvideo-community/HunyuanVideo",
|
||||
torch_dtype={"transformer": torch.bfloat16, "default": torch.float16},
|
||||
)
|
||||
print(pipe.transformer.dtype, pipe.vae.dtype) # (torch.bfloat16, torch.float16)
|
||||
```py
|
||||
print(pipeline.hf_device_map)
|
||||
# {'unet': 1, 'vae': 1, 'safety_checker': 0, 'text_encoder': 0}
|
||||
```
|
||||
|
||||
If a component is not explicitly specified in the dictionary and no `default` is provided, it will be loaded with `torch.float32`.
|
||||
Reset a pipeline's `device_map` with the [`~DiffusionPipeline.reset_device_map`] method. This is necessary if you want to use methods such as `.to()`, [`~DiffusionPipeline.enable_sequential_cpu_offload`], and [`~DiffusionPipeline.enable_model_cpu_offload`].
|
||||
|
||||
### Parallel loading
|
||||
```py
|
||||
pipeline.reset_device_map()
|
||||
```
|
||||
|
||||
## Parallel loading
|
||||
|
||||
Large models are often [sharded](../training/distributed_inference#model-sharding) into smaller files so that they are easier to load. Diffusers supports loading shards in parallel to speed up the loading process.
|
||||
|
||||
Set the environment variables below to enable parallel loading.
|
||||
|
||||
- Set `HF_ENABLE_PARALLEL_LOADING` to `"YES"` to enable parallel loading of shards.
|
||||
- Set `HF_PARALLEL_LOADING_WORKERS` to configure the number of parallel threads to use when loading shards. More workers load a model faster but use more memory.
|
||||
Set `HF_ENABLE_PARALLEL_LOADING` to `"YES"` to enable parallel loading of shards.
|
||||
|
||||
The `device_map` argument should be set to `"cuda"` to pre-allocate a large chunk of memory based on the model size. This substantially reduces model load time because warming up the memory allocator now avoids many smaller calls to the allocator later.
|
||||
|
||||
@@ -129,479 +179,98 @@ import torch
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
os.environ["HF_ENABLE_PARALLEL_LOADING"] = "YES"
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"Wan-AI/Wan2.2-I2V-A14B-Diffusers",
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="cuda"
|
||||
"Wan-AI/Wan2.2-I2V-A14B-Diffusers", torch_dtype=torch.bfloat16, device_map="cuda"
|
||||
)
|
||||
```
|
||||
|
||||
### Local pipeline
|
||||
## Replacing models in a pipeline
|
||||
|
||||
To load a pipeline locally, use [git-lfs](https://git-lfs.github.com/) to manually download a checkpoint to your local disk.
|
||||
[`DiffusionPipeline`] is flexible and accommodates loading different models or schedulers. You can experiment with different schedulers to optimize for generation speed or quality, and you can replace models with more performant ones.
|
||||
|
||||
```bash
|
||||
git-lfs install
|
||||
git clone https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5
|
||||
```
|
||||
|
||||
This creates a local folder, ./stable-diffusion-v1-5, on your disk and you should pass its path to [`~DiffusionPipeline.from_pretrained`].
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
stable_diffusion = DiffusionPipeline.from_pretrained("./stable-diffusion-v1-5", use_safetensors=True)
|
||||
```
|
||||
|
||||
The [`~DiffusionPipeline.from_pretrained`] method won't download files from the Hub when it detects a local path, but this also means it won't download and cache the latest changes to a checkpoint.
|
||||
|
||||
## Customize a pipeline
|
||||
|
||||
You can customize a pipeline by loading different components into it. This is important because you can:
|
||||
|
||||
- change to a scheduler with faster generation speed or higher generation quality depending on your needs (call the `scheduler.compatibles` method on your pipeline to see compatible schedulers)
|
||||
- change a default pipeline component to a newer and better performing one
|
||||
|
||||
For example, let's customize the default [stabilityai/stable-diffusion-xl-base-1.0](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0) checkpoint with:
|
||||
|
||||
- The [`HeunDiscreteScheduler`] to generate higher quality images at the expense of slower generation speed. You must pass the `subfolder="scheduler"` parameter in [`~HeunDiscreteScheduler.from_pretrained`] to load the scheduler configuration into the correct [subfolder](https://hf.co/stabilityai/stable-diffusion-xl-base-1.0/tree/main/scheduler) of the pipeline repository.
|
||||
- A more stable VAE that runs in fp16.
|
||||
The example below swaps the default scheduler to generate higher quality images and a more stable VAE version. Pass the `subfolder` argument in [`~HeunDiscreteScheduler.from_pretrained`] to load the scheduler to the correct subfolder.
|
||||
|
||||
```py
|
||||
from diffusers import StableDiffusionXLPipeline, HeunDiscreteScheduler, AutoencoderKL
|
||||
import torch
|
||||
from diffusers import DiffusionPipeline, HeunDiscreteScheduler, AutoModel
|
||||
|
||||
scheduler = HeunDiscreteScheduler.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler")
|
||||
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16, use_safetensors=True)
|
||||
```
|
||||
scheduler = HeunDiscreteScheduler.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", subfolder="scheduler"
|
||||
)
|
||||
vae = AutoModel.from_pretrained(
|
||||
"madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch.float16
|
||||
)
|
||||
|
||||
Now pass the new scheduler and VAE to the [`StableDiffusionXLPipeline`].
|
||||
|
||||
```py
|
||||
pipeline = StableDiffusionXLPipeline.from_pretrained(
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0",
|
||||
scheduler=scheduler,
|
||||
vae=vae,
|
||||
torch_dtype=torch.float16,
|
||||
variant="fp16",
|
||||
use_safetensors=True
|
||||
).to("cuda")
|
||||
device_map="cuda"
|
||||
)
|
||||
```
|
||||
|
||||
## Reuse a pipeline
|
||||
## Reusing models in multiple pipelines
|
||||
|
||||
When you load multiple pipelines that share the same model components, it makes sense to reuse the shared components instead of reloading everything into memory again, especially if your hardware is memory-constrained. For example:
|
||||
When working with multiple pipelines that use the same model, the [`~DiffusionPipeline.from_pipe`] method enables reusing a model instead of reloading it each time. This allows you to use multiple pipelines without increasing memory usage.
|
||||
|
||||
1. You generated an image with the [`StableDiffusionPipeline`] but you want to improve its quality with the [`StableDiffusionSAGPipeline`]. Both of these pipelines share the same pretrained model, so it'd be a waste of memory to load the same model twice.
|
||||
2. You want to add a model component, like a [`MotionAdapter`](../api/pipelines/animatediff#animatediffpipeline), to [`AnimateDiffPipeline`] which was instantiated from an existing [`StableDiffusionPipeline`]. Again, both pipelines share the same pretrained model, so it'd be a waste of memory to load an entirely new pipeline again.
|
||||
Memory usage is determined by the pipeline with the highest memory requirement regardless of the number of pipelines.
|
||||
|
||||
With the [`DiffusionPipeline.from_pipe`] API, you can switch between multiple pipelines to take advantage of their different features without increasing memory-usage. It is similar to turning on and off a feature in your pipeline.
|
||||
|
||||
> [!TIP]
|
||||
> To switch between tasks (rather than features), use the [`~DiffusionPipeline.from_pipe`] method with the [AutoPipeline](../api/pipelines/auto_pipeline) class, which automatically identifies the pipeline class based on the task (learn more in the [AutoPipeline](../tutorials/autopipeline) tutorial).
|
||||
|
||||
Let's start with a [`StableDiffusionPipeline`] and then reuse the loaded model components to create a [`StableDiffusionSAGPipeline`] to increase generation quality. You'll use the [`StableDiffusionPipeline`] with an [IP-Adapter](./ip_adapter) to generate a bear eating pizza.
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline, StableDiffusionSAGPipeline
|
||||
import torch
|
||||
import gc
|
||||
from diffusers.utils import load_image
|
||||
from accelerate.utils import compute_module_sizes
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/load_neg_embed.png")
|
||||
|
||||
pipe_sd = DiffusionPipeline.from_pretrained("SG161222/Realistic_Vision_V6.0_B1_noVAE", torch_dtype=torch.float16)
|
||||
pipe_sd.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
pipe_sd.set_ip_adapter_scale(0.6)
|
||||
pipe_sd.to("cuda")
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
out_sd = pipe_sd(
|
||||
prompt="bear eats pizza",
|
||||
negative_prompt="wrong white balance, dark, sketches,worst quality,low quality",
|
||||
ip_adapter_image=image,
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
out_sd
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_sd_0.png"/>
|
||||
</div>
|
||||
|
||||
For reference, you can check how much memory this process consumed.
|
||||
|
||||
```python
|
||||
def bytes_to_giga_bytes(bytes):
|
||||
return bytes / 1024 / 1024 / 1024
|
||||
print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")
|
||||
"Max memory allocated: 4.406213283538818 GB"
|
||||
```
|
||||
|
||||
Now, reuse the same pipeline components from [`StableDiffusionPipeline`] in [`StableDiffusionSAGPipeline`] with the [`~DiffusionPipeline.from_pipe`] method.
|
||||
The example below loads a pipeline and then loads a second pipeline with [`~DiffusionPipeline.from_pipe`] to use [perturbed-attention guidance (PAG)](../api/pipelines/pag) to improve generation quality.
|
||||
|
||||
> [!WARNING]
|
||||
> Some pipeline methods may not function properly on new pipelines created with [`~DiffusionPipeline.from_pipe`]. For instance, the [`~DiffusionPipeline.enable_model_cpu_offload`] method installs hooks on the model components based on a unique offloading sequence for each pipeline. If the models are executed in a different order in the new pipeline, the CPU offloading may not work correctly.
|
||||
>
|
||||
> To ensure everything works as expected, we recommend re-applying a pipeline method on a new pipeline created with [`~DiffusionPipeline.from_pipe`].
|
||||
> Use [`AutoPipelineForText2Image`] because [`DiffusionPipeline`] doesn't support PAG. Refer to the [AutoPipeline](../tutorials/autopipeline) docs to learn more.
|
||||
|
||||
```python
|
||||
pipe_sag = StableDiffusionSAGPipeline.from_pipe(
|
||||
pipe_sd
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoPipelineForText2Image
|
||||
|
||||
pipeline_sdxl = AutoPipelineForText2Image.from_pretrained(
|
||||
"stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16, device_map="cuda"
|
||||
)
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
out_sag = pipe_sag(
|
||||
prompt="bear eats pizza",
|
||||
negative_prompt="wrong white balance, dark, sketches,worst quality,low quality",
|
||||
ip_adapter_image=image,
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
guidance_scale=1.0,
|
||||
sag_scale=0.75
|
||||
).images[0]
|
||||
out_sag
|
||||
prompt = """
|
||||
cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
|
||||
highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
|
||||
"""
|
||||
image = pipeline_sdxl(prompt).images[0]
|
||||
print(f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
|
||||
# Max memory allocated: 10.47 GB
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_sag_1.png"/>
|
||||
</div>
|
||||
|
||||
If you check the memory usage, you'll see it remains the same as before because [`StableDiffusionPipeline`] and [`StableDiffusionSAGPipeline`] are sharing the same pipeline components. This allows you to use them interchangeably without any additional memory overhead.
|
||||
Set `enable_pag=True` in the second pipeline to enable PAG. The second pipeline uses the same amount of memory because it shares model weights with the first one.
|
||||
|
||||
```py
|
||||
print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")
|
||||
"Max memory allocated: 4.406213283538818 GB"
|
||||
pipeline = AutoPipelineForText2Image.from_pipe(
|
||||
pipeline_sdxl, enable_pag=True
|
||||
)
|
||||
prompt = """
|
||||
cinematic film still of a cat sipping a margarita in a pool in Palm Springs, California
|
||||
highly detailed, high budget hollywood movie, cinemascope, moody, epic, gorgeous, film grain
|
||||
"""
|
||||
image = pipeline(prompt).images[0]
|
||||
print(f"Max memory allocated: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GB")
|
||||
# Max memory allocated: 10.47 GB
|
||||
```
|
||||
|
||||
Let's animate the image with the [`AnimateDiffPipeline`] and also add a [`MotionAdapter`] module to the pipeline. For the [`AnimateDiffPipeline`], you need to unload the IP-Adapter first and reload it *after* you've created your new pipeline (this only applies to the [`AnimateDiffPipeline`]).
|
||||
> [!WARNING]
|
||||
> Pipelines created by [`~DiffusionPipeline.from_pipe`] share the same models and *state*. Modifying the state of a model in one pipeline affects all the other pipelines that share the same model.
|
||||
|
||||
```py
|
||||
from diffusers import AnimateDiffPipeline, MotionAdapter, DDIMScheduler
|
||||
from diffusers.utils import export_to_gif
|
||||
|
||||
pipe_sag.unload_ip_adapter()
|
||||
adapter = MotionAdapter.from_pretrained("guoyww/animatediff-motion-adapter-v1-5-2", torch_dtype=torch.float16)
|
||||
|
||||
pipe_animate = AnimateDiffPipeline.from_pipe(pipe_sd, motion_adapter=adapter)
|
||||
pipe_animate.scheduler = DDIMScheduler.from_config(pipe_animate.scheduler.config, beta_schedule="linear")
|
||||
# load IP-Adapter and LoRA weights again
|
||||
pipe_animate.load_ip_adapter("h94/IP-Adapter", subfolder="models", weight_name="ip-adapter_sd15.bin")
|
||||
pipe_animate.load_lora_weights("guoyww/animatediff-motion-lora-zoom-out", adapter_name="zoom-out")
|
||||
pipe_animate.to("cuda")
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
pipe_animate.set_adapters("zoom-out", adapter_weights=0.75)
|
||||
out = pipe_animate(
|
||||
prompt="bear eats pizza",
|
||||
num_frames=16,
|
||||
num_inference_steps=50,
|
||||
ip_adapter_image=image,
|
||||
generator=generator,
|
||||
).frames[0]
|
||||
export_to_gif(out, "out_animate.gif")
|
||||
```
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img class="rounded-xl" src="https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/from_pipe_out_animate_3.gif"/>
|
||||
</div>
|
||||
|
||||
The [`AnimateDiffPipeline`] is more memory-intensive and consumes 15GB of memory (see the [Memory-usage of from_pipe](#memory-usage-of-from_pipe) section to learn what this means for your memory-usage).
|
||||
|
||||
```py
|
||||
print(f"Max memory allocated: {bytes_to_giga_bytes(torch.cuda.max_memory_allocated())} GB")
|
||||
"Max memory allocated: 15.178664207458496 GB"
|
||||
```
|
||||
|
||||
### Modify from_pipe components
|
||||
|
||||
Pipelines loaded with [`~DiffusionPipeline.from_pipe`] can be customized with different model components or methods. However, whenever you modify the *state* of the model components, it affects all the other pipelines that share the same components. For example, if you call [`~diffusers.loaders.IPAdapterMixin.unload_ip_adapter`] on the [`StableDiffusionSAGPipeline`], you won't be able to use IP-Adapter with the [`StableDiffusionPipeline`] because it's been removed from their shared components.
|
||||
|
||||
```py
|
||||
pipe_sag.unload_ip_adapter()
|
||||
|
||||
generator = torch.Generator(device="cpu").manual_seed(33)
|
||||
out_sd = pipe_sd(
|
||||
prompt="bear eats pizza",
|
||||
negative_prompt="wrong white balance, dark, sketches,worst quality,low quality",
|
||||
ip_adapter_image=image,
|
||||
num_inference_steps=50,
|
||||
generator=generator,
|
||||
).images[0]
|
||||
"AttributeError: 'NoneType' object has no attribute 'image_projection_layers'"
|
||||
```
|
||||
|
||||
### Memory usage of from_pipe
|
||||
|
||||
The memory requirement of loading multiple pipelines with [`~DiffusionPipeline.from_pipe`] is determined by the pipeline with the highest memory-usage regardless of the number of pipelines you create.
|
||||
|
||||
| Pipeline | Memory usage (GB) |
|
||||
|---|---|
|
||||
| StableDiffusionPipeline | 4.400 |
|
||||
| StableDiffusionSAGPipeline | 4.400 |
|
||||
| AnimateDiffPipeline | 15.178 |
|
||||
|
||||
The [`AnimateDiffPipeline`] has the highest memory requirement, so the *total memory-usage* is based only on the [`AnimateDiffPipeline`]. Your memory-usage will not increase if you create additional pipelines as long as their memory requirements don't exceed that of the [`AnimateDiffPipeline`]. Each pipeline can be used interchangeably without any additional memory overhead.
|
||||
Some methods may not work correctly on pipelines created with [`~DiffusionPipeline.from_pipe`]. For example, [`~DiffusionPipeline.enable_model_cpu_offload`] relies on a unique model execution order, which may differ in the new pipeline. To ensure proper functionality, reapply these methods on the new pipeline.
|
||||
|
||||
## Safety checker
|
||||
|
||||
Diffusers implements a [safety checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) for Stable Diffusion models which can generate harmful content. The safety checker screens the generated output against known hardcoded not-safe-for-work (NSFW) content. If for whatever reason you'd like to disable the safety checker, pass `safety_checker=None` to the [`~DiffusionPipeline.from_pretrained`] method.
|
||||
Diffusers provides a [safety checker](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/safety_checker.py) for older Stable Diffusion models to prevent generating harmful content. It screens the generated output against a set of hardcoded harmful concepts.
|
||||
|
||||
```python
|
||||
If you want to disable the safety checker, pass `safety_checker=None` in [`~DiffusionPipeline.from_pretrained`] as shown below.
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None, use_safetensors=True)
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5", safety_checker=None
|
||||
)
|
||||
"""
|
||||
You have disabled the safety checker for <class 'diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline'> by passing `safety_checker=None`. Ensure that you abide by the conditions of the Stable Diffusion license and do not expose unfiltered results in services or applications open to the public. Both the diffusers team and Hugging Face strongly recommend keeping the safety filter enabled in all public-facing circumstances, disabling it only for use cases that involve analyzing network behavior or auditing its results. For more information, please have a look at https://github.com/huggingface/diffusers/pull/254 .
|
||||
"""
|
||||
```
|
||||
|
||||
## Checkpoint variants
|
||||
|
||||
A checkpoint variant is usually a checkpoint whose weights are:
|
||||
|
||||
- Stored in a different floating point type, such as [torch.float16](https://pytorch.org/docs/stable/tensors.html#data-types), because it only requires half the bandwidth and storage to download. You can't use this variant if you're continuing training or using a CPU.
|
||||
- Non-exponential moving average (non-EMA) weights, which shouldn't be used for inference. You should use this variant to continue finetuning a model.
|
||||
|
||||
> [!TIP]
|
||||
> When the checkpoints have identical model structures, but they were trained on different datasets and with a different training setup, they should be stored in separate repositories. For example, [stabilityai/stable-diffusion-2](https://hf.co/stabilityai/stable-diffusion-2) and [stabilityai/stable-diffusion-2-1](https://hf.co/stabilityai/stable-diffusion-2-1) are stored in separate repositories.
|
||||
|
||||
Otherwise, a variant is **identical** to the original checkpoint. They have exactly the same serialization format (like [safetensors](./using_safetensors)), model structure, and their weights have identical tensor shapes.
|
||||
|
||||
| **checkpoint type** | **weight name** | **argument for loading weights** |
|
||||
|---------------------|---------------------------------------------|----------------------------------|
|
||||
| original | diffusion_pytorch_model.safetensors | |
|
||||
| floating point | diffusion_pytorch_model.fp16.safetensors | `variant`, `torch_dtype` |
|
||||
| non-EMA | diffusion_pytorch_model.non_ema.safetensors | `variant` |
|
||||
|
||||
There are two important arguments for loading variants:
|
||||
|
||||
- `torch_dtype` specifies the floating point precision of the loaded checkpoint. For example, if you want to save bandwidth by loading a fp16 variant, you should set `variant="fp16"` and `torch_dtype=torch.float16` to *convert the weights* to fp16. Otherwise, the fp16 weights are converted to the default fp32 precision.
|
||||
|
||||
If you only set `torch_dtype=torch.float16`, the default fp32 weights are downloaded first and then converted to fp16.
|
||||
|
||||
- `variant` specifies which files should be loaded from the repository. For example, if you want to load a non-EMA variant of a UNet from [stable-diffusion-v1-5/stable-diffusion-v1-5](https://hf.co/stable-diffusion-v1-5/stable-diffusion-v1-5/tree/main/unet), set `variant="non_ema"` to download the `non_ema` file.
|
||||
|
||||
<hfoptions id="variants">
|
||||
<hfoption id="fp16">
|
||||
|
||||
```py
|
||||
from diffusers import DiffusionPipeline
|
||||
import torch
|
||||
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5", variant="fp16", torch_dtype=torch.float16, use_safetensors=True
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="non-EMA">
|
||||
|
||||
```py
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5", variant="non_ema", use_safetensors=True
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
Use the `variant` parameter in the [`DiffusionPipeline.save_pretrained`] method to save a checkpoint as a different floating point type or as a non-EMA variant. You should try to save a variant to the same folder as the original checkpoint, so you have the option of loading both from the same folder.
|
||||
|
||||
<hfoptions id="save">
|
||||
<hfoption id="fp16">
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
pipeline.save_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", variant="fp16")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="non_ema">
|
||||
|
||||
```py
|
||||
pipeline.save_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", variant="non_ema")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
If you don't save the variant to an existing folder, you must specify the `variant` argument; otherwise, it'll throw an `Exception` because it can't find the original checkpoint.
|
||||
|
||||
```python
|
||||
# 👎 this won't work
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"./stable-diffusion-v1-5", torch_dtype=torch.float16, use_safetensors=True
|
||||
)
|
||||
# 👍 this works
|
||||
pipeline = DiffusionPipeline.from_pretrained(
|
||||
"./stable-diffusion-v1-5", variant="fp16", torch_dtype=torch.float16, use_safetensors=True
|
||||
)
|
||||
```
|
||||
|
||||
## DiffusionPipeline explained
|
||||
|
||||
As a class method, [`DiffusionPipeline.from_pretrained`] is responsible for two things:
|
||||
|
||||
- Download the latest version of the folder structure required for inference and cache it. If the latest folder structure is available in the local cache, [`DiffusionPipeline.from_pretrained`] reuses the cache and won't redownload the files.
|
||||
- Load the cached weights into the correct pipeline [class](../api/pipelines/overview#diffusers-summary) - retrieved from the `model_index.json` file - and return an instance of it.
|
||||
|
||||
The pipelines' underlying folder structure corresponds directly with their class instances. For example, the [`StableDiffusionPipeline`] corresponds to the folder structure in [`stable-diffusion-v1-5/stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5).
|
||||
|
||||
```python
|
||||
from diffusers import DiffusionPipeline
|
||||
|
||||
repo_id = "stable-diffusion-v1-5/stable-diffusion-v1-5"
|
||||
pipeline = DiffusionPipeline.from_pretrained(repo_id, use_safetensors=True)
|
||||
print(pipeline)
|
||||
```
|
||||
|
||||
You'll see pipeline is an instance of [`StableDiffusionPipeline`], which consists of seven components:
|
||||
|
||||
- `"feature_extractor"`: a [`~transformers.CLIPImageProcessor`] from 🤗 Transformers.
|
||||
- `"safety_checker"`: a [component](https://github.com/huggingface/diffusers/blob/e55687e1e15407f60f32242027b7bb8170e58266/src/diffusers/pipelines/stable_diffusion/safety_checker.py#L32) for screening against harmful content.
|
||||
- `"scheduler"`: an instance of [`PNDMScheduler`].
|
||||
- `"text_encoder"`: a [`~transformers.CLIPTextModel`] from 🤗 Transformers.
|
||||
- `"tokenizer"`: a [`~transformers.CLIPTokenizer`] from 🤗 Transformers.
|
||||
- `"unet"`: an instance of [`UNet2DConditionModel`].
|
||||
- `"vae"`: an instance of [`AutoencoderKL`].
|
||||
|
||||
```json
|
||||
StableDiffusionPipeline {
|
||||
"feature_extractor": [
|
||||
"transformers",
|
||||
"CLIPImageProcessor"
|
||||
],
|
||||
"safety_checker": [
|
||||
"stable_diffusion",
|
||||
"StableDiffusionSafetyChecker"
|
||||
],
|
||||
"scheduler": [
|
||||
"diffusers",
|
||||
"PNDMScheduler"
|
||||
],
|
||||
"text_encoder": [
|
||||
"transformers",
|
||||
"CLIPTextModel"
|
||||
],
|
||||
"tokenizer": [
|
||||
"transformers",
|
||||
"CLIPTokenizer"
|
||||
],
|
||||
"unet": [
|
||||
"diffusers",
|
||||
"UNet2DConditionModel"
|
||||
],
|
||||
"vae": [
|
||||
"diffusers",
|
||||
"AutoencoderKL"
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
Compare the components of the pipeline instance to the [`stable-diffusion-v1-5/stable-diffusion-v1-5`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/tree/main) folder structure, and you'll see there is a separate folder for each of the components in the repository:
|
||||
|
||||
```
|
||||
.
|
||||
├── feature_extractor
|
||||
│ └── preprocessor_config.json
|
||||
├── model_index.json
|
||||
├── safety_checker
|
||||
│ ├── config.json
|
||||
| ├── model.fp16.safetensors
|
||||
│ ├── model.safetensors
|
||||
│ ├── pytorch_model.bin
|
||||
| └── pytorch_model.fp16.bin
|
||||
├── scheduler
|
||||
│ └── scheduler_config.json
|
||||
├── text_encoder
|
||||
│ ├── config.json
|
||||
| ├── model.fp16.safetensors
|
||||
│ ├── model.safetensors
|
||||
│ |── pytorch_model.bin
|
||||
| └── pytorch_model.fp16.bin
|
||||
├── tokenizer
|
||||
│ ├── merges.txt
|
||||
│ ├── special_tokens_map.json
|
||||
│ ├── tokenizer_config.json
|
||||
│ └── vocab.json
|
||||
├── unet
|
||||
│ ├── config.json
|
||||
│ ├── diffusion_pytorch_model.bin
|
||||
| |── diffusion_pytorch_model.fp16.bin
|
||||
│ |── diffusion_pytorch_model.fp16.safetensors
|
||||
│ |── diffusion_pytorch_model.non_ema.bin
|
||||
│ |── diffusion_pytorch_model.non_ema.safetensors
|
||||
│ └── diffusion_pytorch_model.safetensors
|
||||
|── vae
|
||||
. ├── config.json
|
||||
. ├── diffusion_pytorch_model.bin
|
||||
├── diffusion_pytorch_model.fp16.bin
|
||||
├── diffusion_pytorch_model.fp16.safetensors
|
||||
└── diffusion_pytorch_model.safetensors
|
||||
```
|
||||
|
||||
You can access each of the components of the pipeline as an attribute to view its configuration:
|
||||
|
||||
```py
|
||||
pipeline.tokenizer
|
||||
CLIPTokenizer(
|
||||
name_or_path="/root/.cache/huggingface/hub/models--runwayml--stable-diffusion-v1-5/snapshots/39593d5650112b4cc580433f6b0435385882d819/tokenizer",
|
||||
vocab_size=49408,
|
||||
model_max_length=77,
|
||||
is_fast=False,
|
||||
padding_side="right",
|
||||
truncation_side="right",
|
||||
special_tokens={
|
||||
"bos_token": AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True),
|
||||
"eos_token": AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True),
|
||||
"unk_token": AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True),
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
clean_up_tokenization_spaces=True
|
||||
)
|
||||
```
|
||||
|
||||
Every pipeline expects a [`model_index.json`](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5/blob/main/model_index.json) file that tells the [`DiffusionPipeline`]:
|
||||
|
||||
- which pipeline class to load from `_class_name`
|
||||
- which version of 🧨 Diffusers was used to create the model in `_diffusers_version`
|
||||
- what components from which library are stored in the subfolders (`name` corresponds to the component and subfolder name, `library` corresponds to the name of the library to load the class from, and `class` corresponds to the class name)
|
||||
|
||||
```json
|
||||
{
|
||||
"_class_name": "StableDiffusionPipeline",
|
||||
"_diffusers_version": "0.6.0",
|
||||
"feature_extractor": [
|
||||
"transformers",
|
||||
"CLIPImageProcessor"
|
||||
],
|
||||
"safety_checker": [
|
||||
"stable_diffusion",
|
||||
"StableDiffusionSafetyChecker"
|
||||
],
|
||||
"scheduler": [
|
||||
"diffusers",
|
||||
"PNDMScheduler"
|
||||
],
|
||||
"text_encoder": [
|
||||
"transformers",
|
||||
"CLIPTextModel"
|
||||
],
|
||||
"tokenizer": [
|
||||
"transformers",
|
||||
"CLIPTokenizer"
|
||||
],
|
||||
"unet": [
|
||||
"diffusers",
|
||||
"UNet2DConditionModel"
|
||||
],
|
||||
"vae": [
|
||||
"diffusers",
|
||||
"AutoencoderKL"
|
||||
]
|
||||
}
|
||||
```
|
||||
```
|
||||
@@ -176,7 +176,7 @@ Benefits of using the Diffusers-multifolder layout include:
|
||||
).to("cuda")
|
||||
turbo_pipeline.scheduler = EulerDiscreteScheduler.from_config(
|
||||
turbo_pipeline.scheduler.config,
|
||||
timestep+spacing="trailing"
|
||||
timestep_spacing="trailing"
|
||||
)
|
||||
image = turbo_pipeline(
|
||||
"an astronaut riding a unicorn on mars",
|
||||
@@ -267,6 +267,7 @@ pipe = DiffusionPipeline.from_pretrained("black-forest-labs/FLUX.1-dev", torch_d
|
||||
save_folder = "flux-dev"
|
||||
pipe.save_pretrained("flux-dev")
|
||||
export_folder_as_dduf("flux-dev.dduf", folder_path=save_folder)
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> Packaging and loading quantized checkpoints in the DDUF format is supported as long as they respect the multi-folder structure.
|
||||
|
||||
@@ -10,129 +10,86 @@ an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express o
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Reproducible pipelines
|
||||
# Reproducibility
|
||||
|
||||
Diffusion models are inherently random which is what allows it to generate different outputs every time it is run. But there are certain times when you want to generate the same output every time, like when you're testing, replicating results, and even [improving image quality](#deterministic-batch-generation). While you can't expect to get identical results across platforms, you can expect reproducible results across releases and platforms within a certain tolerance range (though even this may vary).
|
||||
Diffusion is a random process that generates a different output every time. For certain situations like testing and replicating results, you want to generate the same result each time, across releases and platforms within a certain tolerance range.
|
||||
|
||||
This guide will show you how to control randomness for deterministic generation on a CPU and GPU.
|
||||
This guide will show you how to control sources of randomness and enable deterministic algorithms.
|
||||
|
||||
## Generator
|
||||
|
||||
Pipelines rely on [torch.randn](https://pytorch.org/docs/stable/generated/torch.randn.html), which uses a different random seed each time, to create the initial noisy tensors. To generate the same output on a CPU or GPU, use a [Generator](https://docs.pytorch.org/docs/stable/generated/torch.Generator.html) to manage how random values are generated.
|
||||
|
||||
> [!TIP]
|
||||
> We strongly recommend reading PyTorch's [statement about reproducibility](https://pytorch.org/docs/stable/notes/randomness.html):
|
||||
>
|
||||
> "Completely reproducible results are not guaranteed across PyTorch releases, individual commits, or different platforms. Furthermore, results may not be reproducible between CPU and GPU executions, even when using identical seeds."
|
||||
> If reproducibility is important to your use case, we recommend always using a CPU `Generator`. The performance loss is often negligible and you'll generate more similar values.
|
||||
|
||||
## Control randomness
|
||||
<hfoptions id="generator">
|
||||
<hfoption id="GPU">
|
||||
|
||||
During inference, pipelines rely heavily on random sampling operations which include creating the
|
||||
Gaussian noise tensors to denoise and adding noise to the scheduling step.
|
||||
The GPU uses a different random number generator than the CPU. Diffusers solves this issue with the [`~utils.torch_utils.randn_tensor`] function, which creates the random tensor on a CPU and then moves it to the GPU. This function is used everywhere inside the pipeline and you don't need to explicitly call it.
|
||||
|
||||
Take a look at the tensor values in the [`DDIMPipeline`] after two inference steps.
|
||||
Use [manual_seed](https://docs.pytorch.org/docs/stable/generated/torch.manual_seed.html) as shown below to set a seed.
|
||||
|
||||
```python
|
||||
from diffusers import DDIMPipeline
|
||||
import numpy as np
|
||||
|
||||
ddim = DDIMPipeline.from_pretrained( "google/ddpm-cifar10-32", use_safetensors=True)
|
||||
image = ddim(num_inference_steps=2, output_type="np").images
|
||||
print(np.abs(image).sum())
|
||||
```
|
||||
|
||||
Running the code above prints one value, but if you run it again you get a different value.
|
||||
|
||||
Each time the pipeline is run, [torch.randn](https://pytorch.org/docs/stable/generated/torch.randn.html) uses a different random seed to create the Gaussian noise tensors. This leads to a different result each time it is run and enables the diffusion pipeline to generate a different random image each time.
|
||||
|
||||
But if you need to reliably generate the same image, that depends on whether you're running the pipeline on a CPU or GPU.
|
||||
|
||||
> [!TIP]
|
||||
> It might seem unintuitive to pass `Generator` objects to a pipeline instead of the integer value representing the seed. However, this is the recommended design when working with probabilistic models in PyTorch because a `Generator` is a *random state* that can be passed to multiple pipelines in a sequence. As soon as the `Generator` is consumed, the *state* is changed in place which means even if you passed the same `Generator` to a different pipeline, it won't produce the same result because the state is already changed.
|
||||
|
||||
<hfoptions id="hardware">
|
||||
<hfoption id="CPU">
|
||||
|
||||
To generate reproducible results on a CPU, you'll need to use a PyTorch [Generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) and set a seed. Now when you run the code, it always prints a value of `1491.1711` because the `Generator` object with the seed is passed to all the random functions in the pipeline. You should get a similar, if not the same, result on whatever hardware and PyTorch version you're using.
|
||||
|
||||
```python
|
||||
```py
|
||||
import torch
|
||||
import numpy as np
|
||||
from diffusers import DDIMPipeline
|
||||
|
||||
ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
|
||||
ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", device_map="cuda")
|
||||
generator = torch.manual_seed(0)
|
||||
image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
|
||||
print(np.abs(image).sum())
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="CPU">
|
||||
|
||||
Set `device="cpu"` in the `Generator` and use [manual_seed](https://docs.pytorch.org/docs/stable/generated/torch.manual_seed.html) to set a seed for generating random numbers.
|
||||
|
||||
```py
|
||||
import torch
|
||||
import numpy as np
|
||||
from diffusers import DDIMPipeline
|
||||
|
||||
ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32")
|
||||
generator = torch.Generator(device="cpu").manual_seed(0)
|
||||
image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
|
||||
print(np.abs(image).sum())
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="GPU">
|
||||
|
||||
Writing a reproducible pipeline on a GPU is a bit trickier, and full reproducibility across different hardware is not guaranteed because matrix multiplication - which diffusion pipelines require a lot of - is less deterministic on a GPU than a CPU. For example, if you run the same code example from the CPU example, you'll get a different result even though the seed is identical. This is because the GPU uses a different random number generator than the CPU.
|
||||
|
||||
```python
|
||||
import torch
|
||||
import numpy as np
|
||||
from diffusers import DDIMPipeline
|
||||
|
||||
ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
|
||||
ddim.to("cuda")
|
||||
generator = torch.Generator(device="cuda").manual_seed(0)
|
||||
image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
|
||||
print(np.abs(image).sum())
|
||||
```
|
||||
|
||||
To avoid this issue, Diffusers has a [`~utils.torch_utils.randn_tensor`] function for creating random noise on the CPU, and then moving the tensor to a GPU if necessary. The [`~utils.torch_utils.randn_tensor`] function is used everywhere inside the pipeline. Now you can call [torch.manual_seed](https://pytorch.org/docs/stable/generated/torch.manual_seed.html) which automatically creates a CPU `Generator` that can be passed to the pipeline even if it is being run on a GPU.
|
||||
|
||||
```python
|
||||
import torch
|
||||
import numpy as np
|
||||
from diffusers import DDIMPipeline
|
||||
|
||||
ddim = DDIMPipeline.from_pretrained("google/ddpm-cifar10-32", use_safetensors=True)
|
||||
ddim.to("cuda")
|
||||
generator = torch.manual_seed(0)
|
||||
image = ddim(num_inference_steps=2, output_type="np", generator=generator).images
|
||||
print(np.abs(image).sum())
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> If reproducibility is important to your use case, we recommend always passing a CPU `Generator`. The performance loss is often negligible and you'll generate more similar values than if the pipeline had been run on a GPU.
|
||||
|
||||
Finally, more complex pipelines such as [`UnCLIPPipeline`], are often extremely
|
||||
susceptible to precision error propagation. You'll need to use
|
||||
exactly the same hardware and PyTorch version for full reproducibility.
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
The `Generator` object should be passed to the pipeline instead of an integer seed. `Generator` maintains a *random state* that is consumed and modified when used. Once consumed, the same `Generator` object produces different results in subsequent calls, even across different pipelines, because its *state* has changed.
|
||||
|
||||
```py
|
||||
generator = torch.manual_seed(0)
|
||||
|
||||
for _ in range(5):
|
||||
- image = pipeline(prompt, generator=generator)
|
||||
+ image = pipeline(prompt, generator=torch.manual_seed(0))
|
||||
```
|
||||
|
||||
## Deterministic algorithms
|
||||
|
||||
You can also configure PyTorch to use deterministic algorithms to create a reproducible pipeline. The downside is that deterministic algorithms may be slower than non-deterministic ones and you may observe a decrease in performance.
|
||||
PyTorch supports [deterministic algorithms](https://docs.pytorch.org/docs/stable/notes/randomness.html#avoiding-nondeterministic-algorithms) - where available - for certain operations so they produce the same results. Deterministic algorithms may be slower and decrease performance.
|
||||
|
||||
Non-deterministic behavior occurs when operations are launched in more than one CUDA stream. To avoid this, set the environment variable [CUBLAS_WORKSPACE_CONFIG](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility) to `:16:8` to only use one buffer size during runtime.
|
||||
|
||||
PyTorch typically benchmarks multiple algorithms to select the fastest one, but if you want reproducibility, you should disable this feature because the benchmark may select different algorithms each time. Set Diffusers [enable_full_determinism](https://github.com/huggingface/diffusers/blob/142f353e1c638ff1d20bd798402b68f72c1ebbdd/src/diffusers/utils/testing_utils.py#L861) to enable deterministic algorithms.
|
||||
|
||||
```py
|
||||
enable_full_determinism()
|
||||
```
|
||||
|
||||
Now when you run the same pipeline twice, you'll get identical results.
|
||||
Use Diffusers' [enable_full_determinism](https://github.com/huggingface/diffusers/blob/142f353e1c638ff1d20bd798402b68f72c1ebbdd/src/diffusers/utils/testing_utils.py#L861) function to enable deterministic algorithms.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import DDIMScheduler, StableDiffusionPipeline
|
||||
from diffusers.utils.testing_utils import enable_full_determinism
|
||||
|
||||
pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", use_safetensors=True).to("cuda")
|
||||
pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
|
||||
g = torch.Generator(device="cuda")
|
||||
|
||||
prompt = "A bear is playing a guitar on Times Square"
|
||||
|
||||
g.manual_seed(0)
|
||||
result1 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
|
||||
|
||||
g.manual_seed(0)
|
||||
result2 = pipe(prompt=prompt, num_inference_steps=50, generator=g, output_type="latent").images
|
||||
|
||||
print("L_inf dist =", abs(result1 - result2).max())
|
||||
"L_inf dist = tensor(0., device='cuda:0')"
|
||||
enable_full_determinism()
|
||||
```
|
||||
|
||||
Under the hood, `enable_full_determinism` works by:
|
||||
|
||||
- Setting the environment variable [CUBLAS_WORKSPACE_CONFIG](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility) to `:16:8` to only use one buffer size during runtime. Non-deterministic behavior occurs when operations are used in more than one CUDA stream.
|
||||
- Disabling benchmarking to find the fastest convolution operation by setting `torch.backends.cudnn.benchmark=False`. Non-deterministic behavior occurs because the benchmark may select different algorithms each time depending on hardware or benchmarking noise.
|
||||
- Disabling TensorFloat32 (TF32) operations in favor of more precise and consistent full-precision operations.
|
||||
|
||||
|
||||
## Resources
|
||||
|
||||
We strongly recommend reading PyTorch's developer notes about [Reproducibility](https://docs.pytorch.org/docs/stable/notes/randomness.html). You can try to limit randomness, but it is not *guaranteed* even with an identical seed.
|
||||
@@ -165,53 +165,6 @@ image
|
||||
|
||||
Most images look very similar and are comparable in quality. Again, it often comes down to your specific use case so a good approach is to run multiple different schedulers and compare the results.
|
||||
|
||||
### Flax schedulers
|
||||
|
||||
To compare Flax schedulers, you need to additionally load the scheduler state into the model parameters. For example, let's change the default scheduler in [`FlaxStableDiffusionPipeline`] to use the super fast [`FlaxDPMSolverMultistepScheduler`].
|
||||
|
||||
> [!WARNING]
|
||||
> The [`FlaxLMSDiscreteScheduler`] and [`FlaxDDPMScheduler`] are not compatible with the [`FlaxStableDiffusionPipeline`] yet.
|
||||
|
||||
```py
|
||||
import jax
|
||||
import numpy as np
|
||||
from flax.jax_utils import replicate
|
||||
from flax.training.common_utils import shard
|
||||
from diffusers import FlaxStableDiffusionPipeline, FlaxDPMSolverMultistepScheduler
|
||||
|
||||
scheduler, scheduler_state = FlaxDPMSolverMultistepScheduler.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
subfolder="scheduler"
|
||||
)
|
||||
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
|
||||
"stable-diffusion-v1-5/stable-diffusion-v1-5",
|
||||
scheduler=scheduler,
|
||||
variant="bf16",
|
||||
dtype=jax.numpy.bfloat16,
|
||||
)
|
||||
params["scheduler"] = scheduler_state
|
||||
```
|
||||
|
||||
Then you can take advantage of Flax's compatibility with TPUs to generate a number of images in parallel. You'll need to make a copy of the model parameters for each available device and then split the inputs across them to generate your desired number of images.
|
||||
|
||||
```py
|
||||
# Generate 1 image per parallel device (8 on TPUv2-8 or TPUv3-8)
|
||||
prompt = "A photograph of an astronaut riding a horse on Mars, high resolution, high definition."
|
||||
num_samples = jax.device_count()
|
||||
prompt_ids = pipeline.prepare_inputs([prompt] * num_samples)
|
||||
|
||||
prng_seed = jax.random.PRNGKey(0)
|
||||
num_inference_steps = 25
|
||||
|
||||
# shard inputs and rng
|
||||
params = replicate(params)
|
||||
prng_seed = jax.random.split(prng_seed, jax.device_count())
|
||||
prompt_ids = shard(prompt_ids)
|
||||
|
||||
images = pipeline(prompt_ids, params, prng_seed, num_inference_steps, jit=True).images
|
||||
images = pipeline.numpy_to_pil(np.asarray(images.reshape((num_samples,) + images.shape[-3:])))
|
||||
```
|
||||
|
||||
## Models
|
||||
|
||||
Models are loaded from the [`ModelMixin.from_pretrained`] method, which downloads and caches the latest version of the model weights and configurations. If the latest files are available in the local cache, [`~ModelMixin.from_pretrained`] reuses files in the cache instead of re-downloading them.
|
||||
|
||||
@@ -1,225 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# JAX/Flax
|
||||
|
||||
[[open-in-colab]]
|
||||
|
||||
🤗 Diffusers supports Flax for super fast inference on Google TPUs, such as those available in Colab, Kaggle or Google Cloud Platform. This guide shows you how to run inference with Stable Diffusion using JAX/Flax.
|
||||
|
||||
Before you begin, make sure you have the necessary libraries installed:
|
||||
|
||||
```py
|
||||
# uncomment to install the necessary libraries in Colab
|
||||
#!pip install -q jax==0.3.25 jaxlib==0.3.25 flax transformers ftfy
|
||||
#!pip install -q diffusers
|
||||
```
|
||||
|
||||
You should also make sure you're using a TPU backend. While JAX does not run exclusively on TPUs, you'll get the best performance on a TPU because each server has 8 TPU accelerators working in parallel.
|
||||
|
||||
If you are running this guide in Colab, select *Runtime* in the menu above, select the option *Change runtime type*, and then select *TPU* under the *Hardware accelerator* setting. Import JAX and quickly check whether you're using a TPU:
|
||||
|
||||
```python
|
||||
import jax
|
||||
import jax.tools.colab_tpu
|
||||
jax.tools.colab_tpu.setup_tpu()
|
||||
|
||||
num_devices = jax.device_count()
|
||||
device_type = jax.devices()[0].device_kind
|
||||
|
||||
print(f"Found {num_devices} JAX devices of type {device_type}.")
|
||||
assert (
|
||||
"TPU" in device_type,
|
||||
"Available device is not a TPU, please select TPU from Runtime > Change runtime type > Hardware accelerator"
|
||||
)
|
||||
# Found 8 JAX devices of type Cloud TPU.
|
||||
```
|
||||
|
||||
Great, now you can import the rest of the dependencies you'll need:
|
||||
|
||||
```python
|
||||
import jax.numpy as jnp
|
||||
from jax import pmap
|
||||
from flax.jax_utils import replicate
|
||||
from flax.training.common_utils import shard
|
||||
|
||||
from diffusers import FlaxStableDiffusionPipeline
|
||||
```
|
||||
|
||||
## Load a model
|
||||
|
||||
Flax is a functional framework, so models are stateless and parameters are stored outside of them. Loading a pretrained Flax pipeline returns *both* the pipeline and the model weights (or parameters). In this guide, you'll use `bfloat16`, a more efficient half-float type that is supported by TPUs (you can also use `float32` for full precision if you want).
|
||||
|
||||
```python
|
||||
dtype = jnp.bfloat16
|
||||
pipeline, params = FlaxStableDiffusionPipeline.from_pretrained(
|
||||
"CompVis/stable-diffusion-v1-4",
|
||||
variant="bf16",
|
||||
dtype=dtype,
|
||||
)
|
||||
```
|
||||
|
||||
## Inference
|
||||
|
||||
TPUs usually have 8 devices working in parallel, so let's use the same prompt for each device. This means you can perform inference on 8 devices at once, with each device generating one image. As a result, you'll get 8 images in the same amount of time it takes for one chip to generate a single image!
|
||||
|
||||
<Tip>
|
||||
|
||||
Learn more details in the [How does parallelization work?](#how-does-parallelization-work) section.
|
||||
|
||||
</Tip>
|
||||
|
||||
After replicating the prompt, get the tokenized text ids by calling the `prepare_inputs` function on the pipeline. The length of the tokenized text is set to 77 tokens as required by the configuration of the underlying CLIP text model.
|
||||
|
||||
```python
|
||||
prompt = "A cinematic film still of Morgan Freeman starring as Jimi Hendrix, portrait, 40mm lens, shallow depth of field, close up, split lighting, cinematic"
|
||||
prompt = [prompt] * jax.device_count()
|
||||
prompt_ids = pipeline.prepare_inputs(prompt)
|
||||
prompt_ids.shape
|
||||
# (8, 77)
|
||||
```
|
||||
|
||||
Model parameters and inputs have to be replicated across the 8 parallel devices. The parameters dictionary is replicated with [`flax.jax_utils.replicate`](https://flax.readthedocs.io/en/latest/api_reference/flax.jax_utils.html#flax.jax_utils.replicate) which traverses the dictionary and changes the shape of the weights so they are repeated 8 times. Arrays are replicated using `shard`.
|
||||
|
||||
```python
|
||||
# parameters
|
||||
p_params = replicate(params)
|
||||
|
||||
# arrays
|
||||
prompt_ids = shard(prompt_ids)
|
||||
prompt_ids.shape
|
||||
# (8, 1, 77)
|
||||
```
|
||||
|
||||
This shape means each one of the 8 devices receives as an input a `jnp` array with shape `(1, 77)`, where `1` is the batch size per device. On TPUs with sufficient memory, you could have a batch size larger than `1` if you want to generate multiple images (per chip) at once.
|
||||
|
||||
Next, create a random number generator to pass to the generation function. This is standard procedure in Flax, which is very serious and opinionated about random numbers. All functions that deal with random numbers are expected to receive a generator to ensure reproducibility, even when you're training across multiple distributed devices.
|
||||
|
||||
The helper function below uses a seed to initialize a random number generator. As long as you use the same seed, you'll get the exact same results. Feel free to use different seeds when exploring results later in the guide.
|
||||
|
||||
```python
|
||||
def create_key(seed=0):
|
||||
return jax.random.PRNGKey(seed)
|
||||
```
|
||||
|
||||
The helper function, or `rng`, is split 8 times so each device receives a different generator and generates a different image.
|
||||
|
||||
```python
|
||||
rng = create_key(0)
|
||||
rng = jax.random.split(rng, jax.device_count())
|
||||
```
|
||||
|
||||
To take advantage of JAX's optimized speed on a TPU, pass `jit=True` to the pipeline to compile the JAX code into an efficient representation and to ensure the model runs in parallel across the 8 devices.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
You need to ensure all your inputs have the same shape in subsequent calls, otherwise JAX will need to recompile the code which is slower.
|
||||
|
||||
</Tip>
|
||||
|
||||
The first inference run takes more time because it needs to compile the code, but subsequent calls (even with different inputs) are much faster. For example, it took more than a minute to compile on a TPU v2-8, but then it takes about **7s** on a future inference run!
|
||||
|
||||
```py
|
||||
%%time
|
||||
images = pipeline(prompt_ids, p_params, rng, jit=True)[0]
|
||||
|
||||
# CPU times: user 56.2 s, sys: 42.5 s, total: 1min 38s
|
||||
# Wall time: 1min 29s
|
||||
```
|
||||
|
||||
The returned array has shape `(8, 1, 512, 512, 3)` which should be reshaped to remove the second dimension and get 8 images of `512 × 512 × 3`. Then you can use the [`~utils.numpy_to_pil`] function to convert the arrays into images.
|
||||
|
||||
```python
|
||||
from diffusers.utils import make_image_grid
|
||||
|
||||
images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
|
||||
images = pipeline.numpy_to_pil(images)
|
||||
make_image_grid(images, rows=2, cols=4)
|
||||
```
|
||||
|
||||

|
||||
|
||||
## Using different prompts
|
||||
|
||||
You don't necessarily have to use the same prompt on all devices. For example, to generate 8 different prompts:
|
||||
|
||||
```python
|
||||
prompts = [
|
||||
"Labrador in the style of Hokusai",
|
||||
"Painting of a squirrel skating in New York",
|
||||
"HAL-9000 in the style of Van Gogh",
|
||||
"Times Square under water, with fish and a dolphin swimming around",
|
||||
"Ancient Roman fresco showing a man working on his laptop",
|
||||
"Close-up photograph of young black woman against urban background, high quality, bokeh",
|
||||
"Armchair in the shape of an avocado",
|
||||
"Clown astronaut in space, with Earth in the background",
|
||||
]
|
||||
|
||||
prompt_ids = pipeline.prepare_inputs(prompts)
|
||||
prompt_ids = shard(prompt_ids)
|
||||
|
||||
images = pipeline(prompt_ids, p_params, rng, jit=True).images
|
||||
images = images.reshape((images.shape[0] * images.shape[1],) + images.shape[-3:])
|
||||
images = pipeline.numpy_to_pil(images)
|
||||
|
||||
make_image_grid(images, 2, 4)
|
||||
```
|
||||
|
||||

|
||||
|
||||
## How does parallelization work?
|
||||
|
||||
The Flax pipeline in 🤗 Diffusers automatically compiles the model and runs it in parallel on all available devices. Let's take a closer look at how that process works.
|
||||
|
||||
JAX parallelization can be done in multiple ways. The easiest one revolves around using the [`jax.pmap`](https://jax.readthedocs.io/en/latest/_autosummary/jax.pmap.html) function to achieve single-program multiple-data (SPMD) parallelization. It means running several copies of the same code, each on different data inputs. More sophisticated approaches are possible, and you can go over to the JAX [documentation](https://jax.readthedocs.io/en/latest/index.html) to explore this topic in more detail if you are interested!
|
||||
|
||||
`jax.pmap` does two things:
|
||||
|
||||
1. Compiles (or "`jit`s") the code which is similar to `jax.jit()`. This does not happen when you call `pmap`, and only the first time the `pmap`ped function is called.
|
||||
2. Ensures the compiled code runs in parallel on all available devices.
|
||||
|
||||
To demonstrate, call `pmap` on the pipeline's `_generate` method (this is a private method that generates images and may be renamed or removed in future releases of 🤗 Diffusers):
|
||||
|
||||
```python
|
||||
p_generate = pmap(pipeline._generate)
|
||||
```
|
||||
|
||||
After calling `pmap`, the prepared function `p_generate` will:
|
||||
|
||||
1. Make a copy of the underlying function, `pipeline._generate`, on each device.
|
||||
2. Send each device a different portion of the input arguments (this is why it's necessary to call the *shard* function). In this case, `prompt_ids` has shape `(8, 1, 77, 768)` so the array is split into 8 and each copy of `_generate` receives an input with shape `(1, 77, 768)`.
|
||||
|
||||
The most important thing to pay attention to here is the batch size (1 in this example), and the input dimensions that make sense for your code. You don't have to change anything else to make the code work in parallel.
|
||||
|
||||
The first time you call the pipeline takes more time, but the calls afterward are much faster. The `block_until_ready` function is used to correctly measure inference time because JAX uses asynchronous dispatch and returns control to the Python loop as soon as it can. You don't need to use that in your code; blocking occurs automatically when you want to use the result of a computation that has not yet been materialized.
|
||||
|
||||
```py
|
||||
%%time
|
||||
images = p_generate(prompt_ids, p_params, rng)
|
||||
images = images.block_until_ready()
|
||||
|
||||
# CPU times: user 1min 15s, sys: 18.2 s, total: 1min 34s
|
||||
# Wall time: 1min 15s
|
||||
```
|
||||
|
||||
Check your image dimensions to see if they're correct:
|
||||
|
||||
```python
|
||||
images.shape
|
||||
# (8, 1, 512, 512, 3)
|
||||
```
|
||||
|
||||
## Resources
|
||||
|
||||
To learn more about how JAX works with Stable Diffusion, you may be interested in reading:
|
||||
|
||||
* [Accelerating Stable Diffusion XL Inference with JAX on Cloud TPU v5e](https://hf.co/blog/sdxl_jax)
|
||||
@@ -287,7 +287,7 @@ export_to_video(output, "output.mp4", fps=16)
|
||||
|
||||
## Reduce memory usage
|
||||
|
||||
Recent video models like [`HunyuanVideoPipeline`] and [`WanPipeline`], which have 10B+ parameters, require a lot of memory and it often exceeds the memory availabe on consumer hardware. Diffusers offers several techniques for reducing the memory requirements of these large models.
|
||||
Recent video models like [`HunyuanVideoPipeline`] and [`WanPipeline`], which have 10B+ parameters, require a lot of memory and it often exceeds the memory available on consumer hardware. Diffusers offers several techniques for reducing the memory requirements of these large models.
|
||||
|
||||
> [!TIP]
|
||||
> Refer to the [Reduce memory usage](../optimization/memory) guide for more details about other memory saving techniques.
|
||||
|
||||
@@ -88,6 +88,8 @@ PIXART-α Controlnet pipeline | Implementation of the controlnet model for pixar
|
||||
| FaithDiff Stable Diffusion XL Pipeline | Implementation of [(CVPR 2025) FaithDiff: Unleashing Diffusion Priors for Faithful Image Super-resolution](https://huggingface.co/papers/2411.18824) - FaithDiff is a faithful image super-resolution method that leverages latent diffusion models by actively adapting the diffusion prior and jointly fine-tuning its components (encoder and diffusion model) with an alignment module to ensure high fidelity and structural consistency. | [FaithDiff Stable Diffusion XL Pipeline](#faithdiff-stable-diffusion-xl-pipeline) | [](https://huggingface.co/jychen9811/FaithDiff) | [Junyang Chen, Jinshan Pan, Jiangxin Dong, IMAG Lab, (Adapted by Eliseu Silva)](https://github.com/JyChen9811/FaithDiff) |
|
||||
| Stable Diffusion 3 InstructPix2Pix Pipeline | Implementation of Stable Diffusion 3 InstructPix2Pix Pipeline | [Stable Diffusion 3 InstructPix2Pix Pipeline](#stable-diffusion-3-instructpix2pix-pipeline) | [](https://huggingface.co/BleachNick/SD3_UltraEdit_freeform) [](https://huggingface.co/CaptainZZZ/sd3-instructpix2pix) | [Jiayu Zhang](https://github.com/xduzhangjiayu) and [Haozhe Zhao](https://github.com/HaozheZhao)|
|
||||
| Flux Kontext multiple images | A modified version of the `FluxKontextPipeline` that supports calling Flux Kontext with multiple reference images.| [Flux Kontext multiple input Pipeline](#flux-kontext-multiple-images) | - | [Net-Mist](https://github.com/Net-Mist) |
|
||||
|
||||
|
||||
To load a custom pipeline you just need to pass the `custom_pipeline` argument to `DiffusionPipeline`, as one of the files in `diffusers/examples/community`. Feel free to send a PR with your own pipelines, we will merge them quickly.
|
||||
|
||||
```py
|
||||
|
||||
@@ -398,7 +398,7 @@ class ComposableStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin)
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -147,7 +147,7 @@ class ImagicStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -197,7 +197,7 @@ class ImageToImageInpaintingPipeline(DiffusionPipeline):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -173,7 +173,7 @@ class StableDiffusionWalkPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -888,7 +888,7 @@ class StableDiffusionLongPromptWeightingPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
@@ -1131,7 +1131,7 @@ class StableDiffusionLongPromptWeightingPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -721,7 +721,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
|
||||
latents (`np.ndarray`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
max_embeddings_multiples (`int`, *optional*, defaults to `3`):
|
||||
The max multiple length of prompt embeddings compared to the max output length of text encoder.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
@@ -918,7 +918,7 @@ class OnnxStableDiffusionLongPromptWeightingPipeline(OnnxStableDiffusionPipeline
|
||||
latents (`np.ndarray`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
max_embeddings_multiples (`int`, *optional*, defaults to `3`):
|
||||
The max multiple length of prompt embeddings compared to the max output length of text encoder.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
|
||||
@@ -1519,7 +1519,7 @@ class SDXLLongPromptWeightingPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
ip_adapter_image (`PipelineImageInput`, *optional*):
|
||||
Optional image input to work with IP Adapters.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
|
||||
@@ -187,7 +187,7 @@ class MultilingualStableDiffusion(DiffusionPipeline, StableDiffusionMixin):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -888,7 +888,7 @@ class KolorsControlNetPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -1066,7 +1066,7 @@ class KolorsControlNetImg2ImgPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -1298,7 +1298,7 @@ class KolorsControlNetInpaintPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -724,7 +724,7 @@ class DemoFusionSDXLPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -1906,7 +1906,7 @@ class FaithDiffStableDiffusionXLPipeline(
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -730,7 +730,7 @@ class FluxDifferentialImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
|
||||
1)`, or `(H, W)`.
|
||||
mask_image_latent (`torch.Tensor`, `List[torch.Tensor]`):
|
||||
`Tensor` representing an image batch to mask `image` generated by VAE. If not provided, the mask
|
||||
latents tensor will ge generated by `mask_image`.
|
||||
latents tensor will be generated by `mask_image`.
|
||||
height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
||||
The height in pixels of the generated image. This is set to 1024 by default for the best results.
|
||||
width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
|
||||
@@ -769,7 +769,7 @@ class FluxDifferentialImg2ImgPipeline(DiffusionPipeline, FluxLoraLoaderMixin):
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -885,7 +885,7 @@ class FluxKontextPipeline(
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -711,7 +711,7 @@ class RFInversionFluxPipeline(
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -853,7 +853,7 @@ class FluxSemanticGuidancePipeline(
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -639,7 +639,7 @@ class FluxCFGPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixi
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -904,7 +904,7 @@ class KolorsDifferentialImg2ImgPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -1246,7 +1246,7 @@ class KolorsInpaintPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -611,7 +611,7 @@ class Prompt2PromptPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -1480,7 +1480,7 @@ class StyleAlignedSDXLPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -748,7 +748,7 @@ class StableDiffusion3DifferentialImg2ImgPipeline(DiffusionPipeline):
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -945,7 +945,7 @@ class StableDiffusion3InstructPix2PixPipeline(
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -1786,7 +1786,7 @@ class StableDiffusionXL_AE_Pipeline(
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -973,7 +973,7 @@ class StableDiffusionXLControlNetAdapterPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -1329,7 +1329,7 @@ class StableDiffusionXLControlNetAdapterInpaintPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -1053,7 +1053,7 @@ class StableDiffusionXLDifferentialImg2ImgPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -832,7 +832,7 @@ class StableDiffusionXLPipelineIpex(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -632,7 +632,7 @@ class CogVideoXSTGPipeline(DiffusionPipeline, CogVideoXLoraLoaderMixin):
|
||||
latents (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.FloatTensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -620,7 +620,7 @@ class LTXSTGPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderM
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -682,7 +682,7 @@ class LTXImageToVideoSTGPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVide
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -603,7 +603,7 @@ class MochiSTGPipeline(DiffusionPipeline, Mochi1LoraLoaderMixin):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -657,7 +657,7 @@ class Zero1to3StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -656,7 +656,7 @@ class RerenderAVideoPipeline(StableDiffusionControlNetImg2ImgPipeline):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -591,7 +591,7 @@ class OnnxStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -695,7 +695,7 @@ class TensorRTStableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -326,7 +326,7 @@ class StableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -122,7 +122,7 @@ class SeedResizeStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin)
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -279,7 +279,7 @@ class StableDiffusionComparisonPipeline(DiffusionPipeline, StableDiffusionMixin)
|
||||
latents (`torch.Tensor`, optional):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, optional, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -670,7 +670,7 @@ class StableDiffusionControlNetImg2ImgPipeline(DiffusionPipeline, StableDiffusio
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -810,7 +810,7 @@ class StableDiffusionControlNetInpaintPipeline(DiffusionPipeline, StableDiffusio
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -804,7 +804,7 @@ class StableDiffusionControlNetInpaintImg2ImgPipeline(DiffusionPipeline, StableD
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -179,7 +179,7 @@ class StableDiffusionControlNetReferencePipeline(StableDiffusionControlNetPipeli
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -615,7 +615,7 @@ class StableDiffusionIPEXPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -885,7 +885,7 @@ class StableDiffusionReferencePipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -678,7 +678,7 @@ class StableDiffusionRepaintPipeline(
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -380,7 +380,7 @@ class StableDiffusionXLReferencePipeline(StableDiffusionXLPipeline):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -180,7 +180,7 @@ class TextInpainting(DiffusionPipeline, StableDiffusionMixin):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -231,7 +231,7 @@ class StableDiffusionTiledUpscalePipeline(StableDiffusionUpscalePipeline):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
tile_size (`int`, *optional*):
|
||||
The size of the tiles. Too big can result in an OOM-error.
|
||||
tile_border (`int`, *optional*):
|
||||
|
||||
@@ -209,7 +209,7 @@ class WildcardStableDiffusionPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
output_type (`str`, *optional*, defaults to `"pil"`):
|
||||
The output format of the generated image. Choose between
|
||||
[PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
|
||||
|
||||
@@ -25,6 +25,11 @@ from os.path import abspath, dirname, join
|
||||
git_repo_path = abspath(join(dirname(dirname(dirname(__file__))), "src"))
|
||||
sys.path.insert(1, git_repo_path)
|
||||
|
||||
# Add parent directory to path so we can import from tests
|
||||
repo_root = abspath(dirname(dirname(__file__)))
|
||||
if repo_root not in sys.path:
|
||||
sys.path.insert(0, repo_root)
|
||||
|
||||
|
||||
# silence FutureWarning warnings in tests since often we can't act on them until
|
||||
# they become normal warnings - i.e. the tests still need to test the current functionality
|
||||
@@ -32,13 +37,13 @@ warnings.simplefilter(action="ignore", category=FutureWarning)
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
from diffusers.utils.testing_utils import pytest_addoption_shared
|
||||
from tests.testing_utils import pytest_addoption_shared
|
||||
|
||||
pytest_addoption_shared(parser)
|
||||
|
||||
|
||||
def pytest_terminal_summary(terminalreporter):
|
||||
from diffusers.utils.testing_utils import pytest_terminal_summary_main
|
||||
from tests.testing_utils import pytest_terminal_summary_main
|
||||
|
||||
make_reports = terminalreporter.config.getoption("--make-reports")
|
||||
if make_reports:
|
||||
|
||||
@@ -24,6 +24,8 @@ import math
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
|
||||
# Add repo root to path to import from tests
|
||||
from pathlib import Path
|
||||
|
||||
import accelerate
|
||||
@@ -54,8 +56,7 @@ from diffusers.optimization import get_scheduler
|
||||
from diffusers.training_utils import compute_density_for_timestep_sampling, compute_loss_weighting_for_sd3, free_memory
|
||||
from diffusers.utils import check_min_version, is_wandb_available, make_image_grid
|
||||
from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
|
||||
from diffusers.utils.testing_utils import backend_empty_cache
|
||||
from diffusers.utils.torch_utils import is_compiled_module
|
||||
from diffusers.utils.torch_utils import backend_empty_cache, is_compiled_module
|
||||
|
||||
|
||||
if is_wandb_available():
|
||||
|
||||
@@ -77,7 +77,7 @@ export MODEL_NAME="Qwen/Qwen-Image"
|
||||
export INSTANCE_DIR="dog"
|
||||
export OUTPUT_DIR="trained-qwenimage-lora"
|
||||
|
||||
accelerate launch train_dreambooth_lora_qwenimage.py \
|
||||
accelerate launch train_dreambooth_lora_qwen_image.py \
|
||||
--pretrained_model_name_or_path=$MODEL_NAME \
|
||||
--instance_data_dir=$INSTANCE_DIR \
|
||||
--output_dir=$OUTPUT_DIR \
|
||||
|
||||
@@ -642,6 +642,7 @@ def parse_args(input_args=None):
|
||||
],
|
||||
help="The image interpolation method to use for resizing images.",
|
||||
)
|
||||
parser.add_argument("--enable_npu_flash_attention", action="store_true", help="Enabla Flash Attention for NPU")
|
||||
|
||||
if input_args is not None:
|
||||
args = parser.parse_args(input_args)
|
||||
@@ -1182,6 +1183,13 @@ def main(args):
|
||||
text_encoder_one.requires_grad_(False)
|
||||
text_encoder_two.requires_grad_(False)
|
||||
|
||||
if args.enable_npu_flash_attention:
|
||||
if is_torch_npu_available():
|
||||
logger.info("npu flash attention enabled.")
|
||||
transformer.set_attention_backend("_native_npu")
|
||||
else:
|
||||
raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu device ")
|
||||
|
||||
# For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
|
||||
@@ -80,6 +80,7 @@ from diffusers.utils import (
|
||||
is_wandb_available,
|
||||
)
|
||||
from diffusers.utils.hub_utils import load_or_create_model_card, populate_model_card
|
||||
from diffusers.utils.import_utils import is_torch_npu_available
|
||||
from diffusers.utils.torch_utils import is_compiled_module
|
||||
|
||||
|
||||
@@ -686,6 +687,7 @@ def parse_args(input_args=None):
|
||||
),
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
parser.add_argument("--enable_npu_flash_attention", action="store_true", help="Enabla Flash Attention for NPU")
|
||||
|
||||
if input_args is not None:
|
||||
args = parser.parse_args(input_args)
|
||||
@@ -1213,6 +1215,13 @@ def main(args):
|
||||
text_encoder_one.requires_grad_(False)
|
||||
text_encoder_two.requires_grad_(False)
|
||||
|
||||
if args.enable_npu_flash_attention:
|
||||
if is_torch_npu_available():
|
||||
logger.info("npu flash attention enabled.")
|
||||
transformer.set_attention_backend("_native_npu")
|
||||
else:
|
||||
raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu device ")
|
||||
|
||||
# For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
|
||||
@@ -706,6 +706,7 @@ def parse_args(input_args=None):
|
||||
),
|
||||
)
|
||||
parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
|
||||
parser.add_argument("--enable_npu_flash_attention", action="store_true", help="Enabla Flash Attention for NPU")
|
||||
|
||||
if input_args is not None:
|
||||
args = parser.parse_args(input_args)
|
||||
@@ -1354,6 +1355,13 @@ def main(args):
|
||||
text_encoder_one.requires_grad_(False)
|
||||
text_encoder_two.requires_grad_(False)
|
||||
|
||||
if args.enable_npu_flash_attention:
|
||||
if is_torch_npu_available():
|
||||
logger.info("npu flash attention enabled.")
|
||||
transformer.set_attention_backend("_native_npu")
|
||||
else:
|
||||
raise ValueError("npu flash attention requires torch_npu extensions and is supported only on npu device ")
|
||||
|
||||
# For mixed precision training we cast all non-trainable weights (vae, text_encoder and transformer) to half-precision
|
||||
# as these weights are only used for inference, keeping weights in full precision is not required.
|
||||
weight_dtype = torch.float32
|
||||
|
||||
@@ -860,7 +860,7 @@ class PixArtAlphaControlnetPipeline(DiffusionPipeline):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -202,7 +202,7 @@ class RDMPipeline(DiffusionPipeline, StableDiffusionMixin):
|
||||
latents (`torch.Tensor`, *optional*):
|
||||
Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
|
||||
generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
|
||||
tensor will ge generated by sampling using the supplied random `generator`.
|
||||
tensor will be generated by sampling using the supplied random `generator`.
|
||||
prompt_embeds (`torch.Tensor`, *optional*):
|
||||
Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
|
||||
provided, text embeddings will be generated from `prompt` input argument.
|
||||
|
||||
@@ -24,12 +24,18 @@ import tempfile
|
||||
import torch
|
||||
|
||||
from diffusers import VQModel
|
||||
from diffusers.utils.testing_utils import require_timm
|
||||
|
||||
|
||||
# Add parent directories to path to import from tests
|
||||
sys.path.append("..")
|
||||
repo_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../.."))
|
||||
if repo_root not in sys.path:
|
||||
sys.path.insert(0, repo_root)
|
||||
|
||||
from test_examples_utils import ExamplesTestsAccelerate, run_command # noqa: E402
|
||||
|
||||
from tests.testing_utils import require_timm # noqa
|
||||
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
|
||||
|
||||
@@ -218,6 +218,8 @@ else:
|
||||
"OmniGenTransformer2DModel",
|
||||
"PixArtTransformer2DModel",
|
||||
"PriorTransformer",
|
||||
"QwenImageControlNetModel",
|
||||
"QwenImageMultiControlNetModel",
|
||||
"QwenImageTransformer2DModel",
|
||||
"SanaControlNetModel",
|
||||
"SanaTransformer2DModel",
|
||||
@@ -491,6 +493,7 @@ else:
|
||||
"PixArtAlphaPipeline",
|
||||
"PixArtSigmaPAGPipeline",
|
||||
"PixArtSigmaPipeline",
|
||||
"QwenImageControlNetPipeline",
|
||||
"QwenImageEditPipeline",
|
||||
"QwenImageImg2ImgPipeline",
|
||||
"QwenImageInpaintPipeline",
|
||||
@@ -885,6 +888,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
OmniGenTransformer2DModel,
|
||||
PixArtTransformer2DModel,
|
||||
PriorTransformer,
|
||||
QwenImageControlNetModel,
|
||||
QwenImageMultiControlNetModel,
|
||||
QwenImageTransformer2DModel,
|
||||
SanaControlNetModel,
|
||||
SanaTransformer2DModel,
|
||||
@@ -1128,6 +1133,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
PixArtAlphaPipeline,
|
||||
PixArtSigmaPAGPipeline,
|
||||
PixArtSigmaPipeline,
|
||||
QwenImageControlNetPipeline,
|
||||
QwenImageEditPipeline,
|
||||
QwenImageImg2ImgPipeline,
|
||||
QwenImageInpaintPipeline,
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user