Compare commits

..

3 Commits

Author SHA1 Message Date
DN6 bffa3a9754 update 2025-11-14 15:48:19 +05:30
DN6 1c558712e8 Merge branch 'main' into model-test-refactor 2025-11-12 10:18:07 +05:30
DN6 1f026ad14e update 2025-11-12 10:17:54 +05:30
117 changed files with 4490 additions and 7145 deletions
+7 -7
View File
@@ -84,7 +84,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-k "not Flax and not Onnx" \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
--report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
tests/pipelines/${{ matrix.module }}
@@ -138,7 +138,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-k "not Flax and not Onnx" \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_${{ matrix.module }}_cuda \
--report-log=tests_torch_${{ matrix.module }}_cuda.log \
tests/${{ matrix.module }}
@@ -151,7 +151,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
--make-reports=examples_torch_cuda \
-s -v --make-reports=examples_torch_cuda \
--report-log=examples_torch_cuda.log \
examples/
@@ -198,7 +198,7 @@ jobs:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
RUN_COMPILE: yes
run: |
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "compile" --make-reports=tests_torch_compile_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_compile_cuda_failures_short.txt
@@ -293,7 +293,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-k "not Flax and not Onnx" \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_minimum_version_cuda \
tests/models/test_modeling_common.py \
tests/pipelines/test_pipelines_common.py \
@@ -531,7 +531,7 @@ jobs:
# HF_HOME: /System/Volumes/Data/mnt/cache
# HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
# run: |
# ${CONDA_RUN} pytest -n 1 --make-reports=tests_torch_mps \
# ${CONDA_RUN} pytest -n 1 -s -v --make-reports=tests_torch_mps \
# --report-log=tests_torch_mps.log \
# tests/
# - name: Failure short reports
@@ -587,7 +587,7 @@ jobs:
# HF_HOME: /System/Volumes/Data/mnt/cache
# HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
# run: |
# ${CONDA_RUN} pytest -n 1 --make-reports=tests_torch_mps \
# ${CONDA_RUN} pytest -n 1 -s -v --make-reports=tests_torch_mps \
# --report-log=tests_torch_mps.log \
# tests/
# - name: Failure short reports
+1 -1
View File
@@ -120,7 +120,7 @@ jobs:
if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
run: |
pytest -n 8 --max-worker-restart=0 --dist=loadfile \
-k "not Flax and not Onnx" \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
tests/modular_pipelines
+4 -4
View File
@@ -126,7 +126,7 @@ jobs:
if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
run: |
pytest -n 8 --max-worker-restart=0 --dist=loadfile \
-k "not Flax and not Onnx" \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
tests/pipelines
@@ -134,7 +134,7 @@ jobs:
if: ${{ matrix.config.framework == 'pytorch_models' }}
run: |
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-k "not Flax and not Onnx and not Dependency" \
-s -v -k "not Flax and not Onnx and not Dependency" \
--make-reports=tests_${{ matrix.config.report }} \
tests/models tests/schedulers tests/others
@@ -255,11 +255,11 @@ jobs:
- name: Run fast PyTorch LoRA tests with PEFT
run: |
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
\
-s -v \
--make-reports=tests_peft_main \
tests/lora/
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
\
-s -v \
--make-reports=tests_models_lora_peft_main \
tests/models/ -k "lora"
+5 -5
View File
@@ -151,13 +151,13 @@ jobs:
run: |
if [ "${{ matrix.module }}" = "ip_adapters" ]; then
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-k "not Flax and not Onnx" \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
else
pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-k "not Flax and not Onnx and $pattern" \
-s -v -k "not Flax and not Onnx and $pattern" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
fi
@@ -222,10 +222,10 @@ jobs:
run: |
pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
if [ -z "$pattern" ]; then
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
--make-reports=tests_torch_cuda_${{ matrix.module }}
else
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
--make-reports=tests_torch_cuda_${{ matrix.module }}
fi
@@ -274,7 +274,7 @@ jobs:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
uv pip install ".[training]"
pytest -n 1 --max-worker-restart=0 --dist=loadfile --make-reports=examples_torch_cuda examples/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
- name: Failure short reports
if: ${{ failure() }}
+5 -5
View File
@@ -87,7 +87,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-k "not Flax and not Onnx" \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
- name: Failure short reports
@@ -141,7 +141,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-k "not Flax and not Onnx" \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_cuda_${{ matrix.module }} \
tests/${{ matrix.module }}
@@ -189,7 +189,7 @@ jobs:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
RUN_COMPILE: yes
run: |
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "compile" --make-reports=tests_torch_compile_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_compile_cuda_failures_short.txt
@@ -230,7 +230,7 @@ jobs:
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_xformers_cuda_failures_short.txt
@@ -273,7 +273,7 @@ jobs:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
uv pip install ".[training]"
pytest -n 1 --max-worker-restart=0 --dist=loadfile --make-reports=examples_torch_cuda examples/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
- name: Failure short reports
if: ${{ failure() }}
+1 -1
View File
@@ -70,7 +70,7 @@ jobs:
if: ${{ matrix.config.framework == 'pytorch' }}
run: |
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
-k "not Flax and not Onnx" \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_${{ matrix.config.report }} \
tests/
+1 -1
View File
@@ -57,7 +57,7 @@ jobs:
HF_HOME: /System/Volumes/Data/mnt/cache
HF_TOKEN: ${{ secrets.HF_TOKEN }}
run: |
${CONDA_RUN} python -m pytest -n 0 --make-reports=tests_torch_mps tests/
${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
- name: Failure short reports
if: ${{ failure() }}
+6 -6
View File
@@ -84,7 +84,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-k "not Flax and not Onnx" \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
tests/pipelines/${{ matrix.module }}
- name: Failure short reports
@@ -137,7 +137,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-k "not Flax and not Onnx" \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_${{ matrix.module }}_cuda \
tests/${{ matrix.module }}
@@ -187,7 +187,7 @@ jobs:
CUBLAS_WORKSPACE_CONFIG: :16:8
run: |
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
-k "not Flax and not Onnx" \
-s -v -k "not Flax and not Onnx" \
--make-reports=tests_torch_minimum_cuda \
tests/models/test_modeling_common.py \
tests/pipelines/test_pipelines_common.py \
@@ -240,7 +240,7 @@ jobs:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
RUN_COMPILE: yes
run: |
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "compile" --make-reports=tests_torch_compile_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_compile_cuda_failures_short.txt
@@ -281,7 +281,7 @@ jobs:
env:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
- name: Failure short reports
if: ${{ failure() }}
run: cat reports/tests_torch_xformers_cuda_failures_short.txt
@@ -326,7 +326,7 @@ jobs:
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
run: |
uv pip install ".[training]"
pytest -n 1 --max-worker-restart=0 --dist=loadfile --make-reports=examples_torch_cuda examples/
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
- name: Failure short reports
if: ${{ failure() }}
+4 -10
View File
@@ -22,8 +22,6 @@
title: Reproducibility
- local: using-diffusers/schedulers
title: Schedulers
- local: using-diffusers/automodel
title: AutoModel
- local: using-diffusers/other-formats
title: Model formats
- local: using-diffusers/push_to_hub
@@ -121,8 +119,6 @@
title: ComponentsManager
- local: modular_diffusers/guiders
title: Guiders
- local: modular_diffusers/custom_blocks
title: Building Custom Blocks
title: Modular Diffusers
- isExpanded: false
sections:
@@ -391,8 +387,6 @@
title: Transformer2DModel
- local: api/models/transformer_temporal
title: TransformerTemporalModel
- local: api/models/wan_animate_transformer_3d
title: WanAnimateTransformer3DModel
- local: api/models/wan_transformer_3d
title: WanTransformer3DModel
title: Transformers
@@ -454,8 +448,6 @@
- sections:
- local: api/pipelines/overview
title: Overview
- local: api/pipelines/auto_pipeline
title: AutoPipeline
- sections:
- local: api/pipelines/audioldm
title: AudioLDM
@@ -468,6 +460,8 @@
- local: api/pipelines/stable_audio
title: Stable Audio
title: Audio
- local: api/pipelines/auto_pipeline
title: AutoPipeline
- sections:
- local: api/pipelines/amused
title: aMUSEd
@@ -531,8 +525,6 @@
title: HiDream-I1
- local: api/pipelines/hunyuandit
title: Hunyuan-DiT
- local: api/pipelines/hunyuanimage21
title: HunyuanImage2.1
- local: api/pipelines/pix2pix
title: InstructPix2Pix
- local: api/pipelines/kandinsky
@@ -646,6 +638,8 @@
title: ConsisID
- local: api/pipelines/framepack
title: Framepack
- local: api/pipelines/hunyuanimage21
title: HunyuanImage2.1
- local: api/pipelines/hunyuan_video
title: HunyuanVideo
- local: api/pipelines/i2vgenxl
+9 -1
View File
@@ -12,7 +12,15 @@ specific language governing permissions and limitations under the License.
# AutoModel
[`AutoModel`] automatically retrieves the correct model class from the checkpoint `config.json` file.
The `AutoModel` is designed to make it easy to load a checkpoint without needing to know the specific model class. `AutoModel` automatically retrieves the correct model class from the checkpoint `config.json` file.
```python
from diffusers import AutoModel, AutoPipelineForText2Image
unet = AutoModel.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet")
pipe = AutoPipelineForText2Image.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", unet=unet)
```
## AutoModel
@@ -1,30 +0,0 @@
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License. -->
# WanAnimateTransformer3DModel
A Diffusion Transformer model for 3D video-like data was introduced in [Wan Animate](https://github.com/Wan-Video/Wan2.2) by the Alibaba Wan Team.
The model can be loaded with the following code snippet.
```python
import torch
from diffusers import WanAnimateTransformer3DModel
transformer = WanAnimateTransformer3DModel.from_pretrained("Wan-AI/Wan2.2-Animate-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
```
## WanAnimateTransformer3DModel
[[autodoc]] WanAnimateTransformer3DModel
## Transformer2DModelOutput
[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
+17 -226
View File
@@ -40,7 +40,6 @@ The following Wan models are supported in Diffusers:
- [Wan 2.2 T2V 14B](https://huggingface.co/Wan-AI/Wan2.2-T2V-A14B-Diffusers)
- [Wan 2.2 I2V 14B](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers)
- [Wan 2.2 TI2V 5B](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B-Diffusers)
- [Wan 2.2 Animate 14B](https://huggingface.co/Wan-AI/Wan2.2-Animate-14B-Diffusers)
> [!TIP]
> Click on the Wan models in the right sidebar for more examples of video generation.
@@ -96,15 +95,15 @@ pipeline = WanPipeline.from_pretrained(
pipeline.to("cuda")
prompt = """
The camera rushes from far to near in a low-angle shot,
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
The camera rushes from far to near in a low-angle shot,
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
"""
negative_prompt = """
Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards
"""
@@ -151,15 +150,15 @@ pipeline.transformer = torch.compile(
)
prompt = """
The camera rushes from far to near in a low-angle shot,
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
The camera rushes from far to near in a low-angle shot,
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
"""
negative_prompt = """
Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards
"""
@@ -250,208 +249,6 @@ The code snippets available in [this](https://github.com/huggingface/diffusers/p
The general rule of thumb to keep in mind when preparing inputs for the VACE pipeline is that the input images, or frames of a video that you want to use for conditioning, should have a corresponding mask that is black in color. The black mask signifies that the model will not generate new content for that area, and only use those parts for conditioning the generation process. For parts/frames that should be generated by the model, the mask should be white in color.
</hfoption>
</hfoptions>
### Wan-Animate: Unified Character Animation and Replacement with Holistic Replication
[Wan-Animate](https://huggingface.co/papers/2509.14055) by the Wan Team.
*We introduce Wan-Animate, a unified framework for character animation and replacement. Given a character image and a reference video, Wan-Animate can animate the character by precisely replicating the expressions and movements of the character in the video to generate high-fidelity character videos. Alternatively, it can integrate the animated character into the reference video to replace the original character, replicating the scene's lighting and color tone to achieve seamless environmental integration. Wan-Animate is built upon the Wan model. To adapt it for character animation tasks, we employ a modified input paradigm to differentiate between reference conditions and regions for generation. This design unifies multiple tasks into a common symbolic representation. We use spatially-aligned skeleton signals to replicate body motion and implicit facial features extracted from source images to reenact expressions, enabling the generation of character videos with high controllability and expressiveness. Furthermore, to enhance environmental integration during character replacement, we develop an auxiliary Relighting LoRA. This module preserves the character's appearance consistency while applying the appropriate environmental lighting and color tone. Experimental results demonstrate that Wan-Animate achieves state-of-the-art performance. We are committed to open-sourcing the model weights and its source code.*
The project page: https://humanaigc.github.io/wan-animate
This model was mostly contributed by [M. Tolga Cangöz](https://github.com/tolgacangoz).
#### Usage
The Wan-Animate pipeline supports two modes of operation:
1. **Animation Mode** (default): Animates a character image based on motion and expression from reference videos
2. **Replacement Mode**: Replaces a character in a background video with a new character while preserving the scene
##### Prerequisites
Before using the pipeline, you need to preprocess your reference video to extract:
- **Pose video**: Contains skeletal keypoints representing body motion
- **Face video**: Contains facial feature representations for expression control
For replacement mode, you additionally need:
- **Background video**: The original video containing the scene
- **Mask video**: A mask indicating where to generate content (white) vs. preserve original (black)
> [!NOTE]
> Raw videos should not be used for inputs such as `pose_video`, which the pipeline expects to be preprocessed to extract the proper information. Preprocessing scripts to prepare these inputs are available in the [original Wan-Animate repository](https://github.com/Wan-Video/Wan2.2?tab=readme-ov-file#1-preprocessing). Integration of these preprocessing steps into Diffusers is planned for a future release.
The example below demonstrates how to use the Wan-Animate pipeline:
<hfoptions id="Animate usage">
<hfoption id="Animation mode">
```python
import numpy as np
import torch
from diffusers import AutoencoderKLWan, WanAnimatePipeline
from diffusers.utils import export_to_video, load_image, load_video
model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")
# Load character image and preprocessed videos
image = load_image("path/to/character.jpg")
pose_video = load_video("path/to/pose_video.mp4") # Preprocessed skeletal keypoints
face_video = load_video("path/to/face_video.mp4") # Preprocessed facial features
# Resize image to match VAE constraints
def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
aspect_ratio = image.height / image.width
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
image = image.resize((width, height))
return image, height, width
image, height, width = aspect_ratio_resize(image, pipe)
prompt = "A person dancing energetically in a studio with dynamic lighting and professional camera work"
negative_prompt = "blurry, low quality, distorted, deformed, static, poorly drawn"
# Generate animated video
output = pipe(
image=image,
pose_video=pose_video,
face_video=face_video,
prompt=prompt,
negative_prompt=negative_prompt,
height=height,
width=width,
segment_frame_length=77,
guidance_scale=1.0,
mode="animate", # Animation mode (default)
).frames[0]
export_to_video(output, "animated_character.mp4", fps=30)
```
</hfoption>
<hfoption id="Replacement mode">
```python
import numpy as np
import torch
from diffusers import AutoencoderKLWan, WanAnimatePipeline
from diffusers.utils import export_to_video, load_image, load_video
model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")
# Load all required inputs for replacement mode
image = load_image("path/to/new_character.jpg")
pose_video = load_video("path/to/pose_video.mp4") # Preprocessed skeletal keypoints
face_video = load_video("path/to/face_video.mp4") # Preprocessed facial features
background_video = load_video("path/to/background_video.mp4") # Original scene
mask_video = load_video("path/to/mask_video.mp4") # Black: preserve, White: generate
# Resize image to match video dimensions
def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
aspect_ratio = image.height / image.width
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
image = image.resize((width, height))
return image, height, width
image, height, width = aspect_ratio_resize(image, pipe)
prompt = "A person seamlessly integrated into the scene with consistent lighting and environment"
negative_prompt = "blurry, low quality, inconsistent lighting, floating, disconnected from scene"
# Replace character in background video
output = pipe(
image=image,
pose_video=pose_video,
face_video=face_video,
background_video=background_video,
mask_video=mask_video,
prompt=prompt,
negative_prompt=negative_prompt,
height=height,
width=width,
segment_frame_length=77,
guidance_scale=1.0,
mode="replace", # Replacement mode
).frames[0]
export_to_video(output, "character_replaced.mp4", fps=30)
```
</hfoption>
<hfoption id="Advanced options">
```python
import numpy as np
import torch
from diffusers import AutoencoderKLWan, WanAnimatePipeline
from diffusers.utils import export_to_video, load_image, load_video
model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
pipe.to("cuda")
image = load_image("path/to/character.jpg")
pose_video = load_video("path/to/pose_video.mp4")
face_video = load_video("path/to/face_video.mp4")
def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
aspect_ratio = image.height / image.width
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
image = image.resize((width, height))
return image, height, width
image, height, width = aspect_ratio_resize(image, pipe)
prompt = "A person dancing energetically in a studio"
negative_prompt = "blurry, low quality"
# Advanced: Use temporal guidance and custom callback
def callback_fn(pipe, step_index, timestep, callback_kwargs):
# You can modify latents or other tensors here
print(f"Step {step_index}, Timestep {timestep}")
return callback_kwargs
output = pipe(
image=image,
pose_video=pose_video,
face_video=face_video,
prompt=prompt,
negative_prompt=negative_prompt,
height=height,
width=width,
segment_frame_length=77,
num_inference_steps=50,
guidance_scale=5.0,
prev_segment_conditioning_frames=5, # Use 5 frames for temporal guidance (1 or 5 recommended)
callback_on_step_end=callback_fn,
callback_on_step_end_tensor_inputs=["latents"],
).frames[0]
export_to_video(output, "animated_advanced.mp4", fps=30)
```
</hfoption>
</hfoptions>
#### Key Parameters
- **mode**: Choose between `"animate"` (default) or `"replace"`
- **prev_segment_conditioning_frames**: Number of frames for temporal guidance (1 or 5 recommended). Using 5 provides better temporal consistency but requires more memory
- **guidance_scale**: Controls how closely the output follows the text prompt. Higher values (5-7) produce results more aligned with the prompt. For Wan-Animate, CFG is disabled by default (`guidance_scale=1.0`) but can be enabled to support negative prompts and finer control over facial expressions. (Note that CFG will only target the text prompt and face conditioning.)
## Notes
- Wan2.1 supports LoRAs with [`~loaders.WanLoraLoaderMixin.load_lora_weights`].
@@ -484,10 +281,10 @@ export_to_video(output, "animated_advanced.mp4", fps=30)
# use "steamboat willie style" to trigger the LoRA
prompt = """
steamboat willie style, golden era animation, The camera rushes from far to near in a low-angle shot,
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
steamboat willie style, golden era animation, The camera rushes from far to near in a low-angle shot,
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
"""
@@ -562,12 +359,6 @@ export_to_video(output, "animated_advanced.mp4", fps=30)
- all
- __call__
## WanAnimatePipeline
[[autodoc]] WanAnimatePipeline
- all
- __call__
## WanPipelineOutput
[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput
[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput
@@ -1,492 +0,0 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# Building Custom Blocks
[ModularPipelineBlocks](./pipeline_block) are the fundamental building blocks of a [`ModularPipeline`]. You can create custom blocks by defining their inputs, outputs, and computation logic. This guide demonstrates how to create and use a custom block.
> [!TIP]
> Explore the [Modular Diffusers Custom Blocks](https://huggingface.co/collections/diffusers/modular-diffusers-custom-blocks) collection for official custom modular blocks like Nano Banana.
## Project Structure
Your custom block project should use the following structure:
```shell
.
├── block.py
└── modular_config.json
```
- `block.py` contains the custom block implementation
- `modular_config.json` contains the metadata needed to load the block
## Example: Florence 2 Inpainting Block
In this example we will create a custom block that uses the [Florence 2](https://huggingface.co/docs/transformers/model_doc/florence2) model to process an input image and generate a mask for inpainting.
The first step is to define the components that the block will use. In this case, we will need to use the `Florence2ForConditionalGeneration` model and its corresponding processor `AutoProcessor`. When defining components, we must specify the name of the component within our pipeline and its model class via `type_hint`, and provide a `pretrained_model_name_or_path` for the component if we intend to load the model weights from a specific repository on the Hub.
```py
# Inside block.py
from diffusers.modular_pipelines import (
ModularPipelineBlocks,
ComponentSpec,
)
from transformers import AutoProcessor, Florence2ForConditionalGeneration
class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
@property
def expected_components(self):
return [
ComponentSpec(
name="image_annotator",
type_hint=Florence2ForConditionalGeneration,
pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
),
ComponentSpec(
name="image_annotator_processor",
type_hint=AutoProcessor,
pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
),
]
```
Next, we define the inputs and outputs of the block. The inputs include the image to be annotated, the annotation task, and the annotation prompt. The outputs include the generated mask image and annotations.
```py
from typing import List, Union
from PIL import Image, ImageDraw
import torch
import numpy as np
from diffusers.modular_pipelines import (
PipelineState,
ModularPipelineBlocks,
InputParam,
ComponentSpec,
OutputParam,
)
from transformers import AutoProcessor, Florence2ForConditionalGeneration
class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
    """Custom block that uses Florence 2 to annotate images and produce inpainting masks."""

    @property
    def expected_components(self):
        """Components the block needs: the Florence 2 model and its processor."""
        return [
            ComponentSpec(
                name="image_annotator",
                type_hint=Florence2ForConditionalGeneration,
                pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
            ),
            ComponentSpec(
                name="image_annotator_processor",
                type_hint=AutoProcessor,
                pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
            ),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        """Values the block reads from the pipeline state."""
        return [
            InputParam(
                "image",
                type_hint=Union[Image.Image, List[Image.Image]],
                required=True,
                description="Image(s) to annotate",
            ),
            InputParam(
                "annotation_task",
                type_hint=Union[str, List[str]],
                required=True,
                default="<REFERRING_EXPRESSION_SEGMENTATION>",
                description="""Annotation Task to perform on the image.
                Supported Tasks:

                <OD>
                <REFERRING_EXPRESSION_SEGMENTATION>
                <CAPTION>
                <DETAILED_CAPTION>
                <MORE_DETAILED_CAPTION>
                <DENSE_REGION_CAPTION>
                <CAPTION_TO_PHRASE_GROUNDING>
                <OPEN_VOCABULARY_DETECTION>

                """,
            ),
            InputParam(
                "annotation_prompt",
                type_hint=Union[str, List[str]],
                required=True,
                description="""Annotation Prompt to provide more context to the task.
                Can be used to detect or segment out specific elements in the image
                """,
            ),
            InputParam(
                "annotation_output_type",
                type_hint=str,
                required=True,
                default="mask_image",
                description="""Output type from annotation predictions. Available options are
                mask_image:
                    - black and white mask image for the given image based on the task type
                mask_overlay:
                    - mask overlaid on the original image
                bounding_box:
                    - bounding boxes drawn on the original image
                """,
            ),
            InputParam(
                "annotation_overlay",
                type_hint=bool,
                required=True,
                default=False,
                description="",
            ),
            # The complete implementation reads `block_state.fill` in `__call__`,
            # so the input has to be declared here to be available on the block state.
            InputParam(
                "fill",
                type_hint=str,
                default="white",
                description="Fill color used when drawing the mask over the original image",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Values the block writes back to the pipeline state."""
        return [
            OutputParam(
                "mask_image",
                # `Image` is the PIL module; the image class is `Image.Image`.
                type_hint=Image.Image,
                description="Inpainting Mask for input Image(s)",
            ),
            OutputParam(
                "annotations",
                type_hint=dict,
                description="Annotations Predictions for input Image(s)",
            ),
            OutputParam(
                "image",
                type_hint=Image.Image,
                description="Annotated input Image(s)",
            ),
        ]
```
Now we implement the `__call__` method, which contains the logic for processing the input image and generating the mask.
```py
from typing import List, Union
from PIL import Image, ImageDraw
import torch
import numpy as np
from diffusers.modular_pipelines import (
PipelineState,
ModularPipelineBlocks,
InputParam,
ComponentSpec,
OutputParam,
)
from transformers import AutoProcessor, Florence2ForConditionalGeneration
class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
    """Annotate input image(s) with Florence 2 and turn the predictions into masks or annotated images."""

    @property
    def expected_components(self):
        """Components the block needs: the Florence 2 model and its processor."""
        return [
            ComponentSpec(
                name="image_annotator",
                type_hint=Florence2ForConditionalGeneration,
                pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
            ),
            ComponentSpec(
                name="image_annotator_processor",
                type_hint=AutoProcessor,
                pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
            ),
        ]

    @property
    def inputs(self) -> List[InputParam]:
        """Values the block reads from the pipeline state."""
        return [
            InputParam(
                "image",
                type_hint=Union[Image.Image, List[Image.Image]],
                required=True,
                description="Image(s) to annotate",
            ),
            InputParam(
                "annotation_task",
                type_hint=Union[str, List[str]],
                required=True,
                default="<REFERRING_EXPRESSION_SEGMENTATION>",
                description="""Annotation Task to perform on the image.
                Supported Tasks:

                <OD>
                <REFERRING_EXPRESSION_SEGMENTATION>
                <CAPTION>
                <DETAILED_CAPTION>
                <MORE_DETAILED_CAPTION>
                <DENSE_REGION_CAPTION>
                <CAPTION_TO_PHRASE_GROUNDING>
                <OPEN_VOCABULARY_DETECTION>

                """,
            ),
            InputParam(
                "annotation_prompt",
                type_hint=Union[str, List[str]],
                required=True,
                description="""Annotation Prompt to provide more context to the task.
                Can be used to detect or segment out specific elements in the image
                """,
            ),
            InputParam(
                "annotation_output_type",
                type_hint=str,
                required=True,
                default="mask_image",
                description="""Output type from annotation predictions. Available options are
                mask_image:
                    - black and white mask image for the given image based on the task type
                mask_overlay:
                    - mask overlaid on the original image
                bounding_box:
                    - bounding boxes drawn on the original image
                """,
            ),
            # NOTE(review): `annotation_overlay` is declared but never read by `__call__`
            # below; kept unchanged so existing callers that pass it keep working.
            InputParam(
                "annotation_overlay",
                type_hint=bool,
                required=True,
                default=False,
                description="",
            ),
            # `__call__` reads `block_state.fill`, so the input must be declared here.
            InputParam(
                "fill",
                type_hint=str,
                default="white",
                description="Fill color used when drawing the mask over the original image",
            ),
        ]

    @property
    def intermediate_outputs(self) -> List[OutputParam]:
        """Values the block writes back to the pipeline state."""
        return [
            OutputParam(
                "mask_image",
                # `Image` is the PIL module; the image class is `Image.Image`.
                type_hint=Image.Image,
                description="Inpainting Mask for input Image(s)",
            ),
            OutputParam(
                "annotations",
                type_hint=dict,
                description="Annotations Predictions for input Image(s)",
            ),
            OutputParam(
                "image",
                type_hint=Image.Image,
                description="Annotated input Image(s)",
            ),
        ]

    def get_annotations(self, components, images, prompts, task):
        """Run Florence 2 on `images` and return one post-processed annotation per image.

        Each prompt is prefixed with the task token before being passed to the processor.
        """
        task_prompts = [task + prompt for prompt in prompts]

        inputs = components.image_annotator_processor(
            text=task_prompts, images=images, return_tensors="pt"
        ).to(components.image_annotator.device, components.image_annotator.dtype)

        generated_ids = components.image_annotator.generate(
            input_ids=inputs["input_ids"],
            pixel_values=inputs["pixel_values"],
            max_new_tokens=1024,
            early_stopping=False,
            do_sample=False,
            num_beams=3,
        )
        # Special tokens are kept in the decoded text that is handed to
        # `post_process_generation` below.
        annotations = components.image_annotator_processor.batch_decode(
            generated_ids, skip_special_tokens=False
        )
        outputs = []
        for image, annotation in zip(images, annotations):
            outputs.append(
                components.image_annotator_processor.post_process_generation(
                    annotation, task=task, image_size=(image.width, image.height)
                )
            )
        return outputs

    def prepare_mask(self, images, annotations, overlay=False, fill="white"):
        """Draw each image's annotated regions and return one image per input.

        With `overlay=False` the regions are drawn on a new black "L" mode image;
        with `overlay=True` they are drawn on a copy of the original image.
        """
        masks = []
        for image, annotation in zip(images, annotations):
            mask_image = image.copy() if overlay else Image.new("L", image.size, 0)
            draw = ImageDraw.Draw(mask_image)

            for _, _annotation in annotation.items():
                if "polygons" in _annotation:
                    for polygon in _annotation["polygons"]:
                        polygon = np.array(polygon).reshape(-1, 2)
                        # Fewer than three points cannot form a polygon.
                        if len(polygon) < 3:
                            continue
                        polygon = polygon.reshape(-1).tolist()
                        draw.polygon(polygon, fill=fill)

                elif "bbox" in _annotation:
                    bbox = _annotation["bbox"]
                    # Use the requested fill here too, matching the polygon branch.
                    draw.rectangle(bbox, fill=fill)

            masks.append(mask_image)

        return masks

    def prepare_bounding_boxes(self, images, annotations):
        """Return copies of `images` with each annotation's box outlined and labelled in red."""
        outputs = []
        for image, annotation in zip(images, annotations):
            image_copy = image.copy()
            draw = ImageDraw.Draw(image_copy)
            for _, _annotation in annotation.items():
                bbox = _annotation["bbox"]
                label = _annotation["label"]

                draw.rectangle(bbox, outline="red", width=3)
                # Place the label just above the top-left corner of the box.
                draw.text((bbox[0], bbox[1] - 20), label, fill="red")

            outputs.append(image_copy)

        return outputs

    def prepare_inputs(self, images, prompts):
        """Normalize `images` and `prompts` to lists of equal length.

        Raises:
            ValueError: If the number of images and prompts differ.
        """
        prompts = prompts or ""

        if isinstance(images, Image.Image):
            images = [images]
        if isinstance(prompts, str):
            prompts = [prompts]

        if len(images) != len(prompts):
            raise ValueError("Number of images and annotation prompts must match.")

        return images, prompts

    @torch.no_grad()
    def __call__(self, components, state: PipelineState) -> PipelineState:
        """Annotate the input image(s) and store the results on the pipeline state.

        Always sets `annotations`. `mask_image` is set only for the "mask_image"
        output type (otherwise `None`); `image` is replaced for "mask_overlay" and
        "bounding_box". Returns `components, state`.
        """
        block_state = self.get_block_state(state)
        images, annotation_task_prompt = self.prepare_inputs(
            block_state.image, block_state.annotation_prompt
        )
        task = block_state.annotation_task
        fill = block_state.fill

        annotations = self.get_annotations(
            components, images, annotation_task_prompt, task
        )
        block_state.annotations = annotations
        if block_state.annotation_output_type == "mask_image":
            # The standalone mask keeps the default white fill on a black background.
            block_state.mask_image = self.prepare_mask(images, annotations)
        else:
            block_state.mask_image = None

        if block_state.annotation_output_type == "mask_overlay":
            block_state.image = self.prepare_mask(images, annotations, overlay=True, fill=fill)

        elif block_state.annotation_output_type == "bounding_box":
            block_state.image = self.prepare_bounding_boxes(images, annotations)

        self.set_block_state(state, block_state)

        return components, state
```
Once we have defined our custom block, we can save it to the Hub, using either the CLI or the [`push_to_hub`] method. This will make it easy to share and reuse our custom block with other pipelines.
<hfoptions id="share">
<hfoption id="hf CLI">
```shell
# In the folder with the `block.py` file, run:
diffusers-cli custom_block
```
Then upload the block to the Hub:
```shell
hf upload <your repo id> . .
```
</hfoption>
<hfoption id="push_to_hub">
```py
from block import Florence2ImageAnnotatorBlock
block = Florence2ImageAnnotatorBlock()
block.push_to_hub("<your repo id>")
```
</hfoption>
</hfoptions>
## Using Custom Blocks
Load the custom block with [`~ModularPipelineBlocks.from_pretrained`] and set `trust_remote_code=True`.
```py
import torch
from diffusers.modular_pipelines import ModularPipelineBlocks, SequentialPipelineBlocks
from diffusers.modular_pipelines.stable_diffusion_xl import INPAINT_BLOCKS
from diffusers.utils import load_image
# Fetch the Florence2 image annotator block that will create our mask
image_annotator_block = ModularPipelineBlocks.from_pretrained("diffusers/florence-2-custom-block", trust_remote_code=True)

# Start from a copy of the preset inpainting blocks so the preset itself is not modified
my_blocks = INPAINT_BLOCKS.copy()
# insert the annotation block before the image encoding step
my_blocks.insert("image_annotator", image_annotator_block, 1)

# Assemble the modified set of blocks into a single sequential block
blocks = SequentialPipelineBlocks.from_blocks_dict(my_blocks)

# Create the pipeline from the blocks and load its components
repo_id = "diffusers/modular-stable-diffusion-xl-base-1.0"
pipe = blocks.init_pipeline(repo_id)
pipe.load_components(torch_dtype=torch.float16, device_map="cuda", trust_remote_code=True)

# Input image, resized to 1024x1024
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true")
image = image.resize((1024, 1024))

# `prompt` drives the inpainting; `annotation_task` and `annotation_prompt` are inputs of the annotator block
prompt = ["A red car"]
annotation_task = "<REFERRING_EXPRESSION_SEGMENTATION>"
annotation_prompt = ["the car"]

output = pipe(
    prompt=prompt,
    image=image,
    annotation_task=annotation_task,
    annotation_prompt=annotation_prompt,
    annotation_output_type="mask_image",
    num_inference_steps=35,
    guidance_scale=7.5,
    strength=0.95,
    output="images"
)
# Save the first returned image
output[0].save("florence-inpainting.png")
```
## Editing Custom Blocks
By default, custom blocks are saved in your cache directory. Use the `local_dir` argument to download and edit a custom block in a specific folder.
```py
import torch
from diffusers.modular_pipelines import ModularPipelineBlocks, SequentialPipelineBlocks
from diffusers.modular_pipelines.stable_diffusion_xl import INPAINT_BLOCKS
from diffusers.utils import load_image
# Fetch the Florence2 image annotator block that will create our mask.
# `local_dir` downloads the block into the given folder instead of the cache directory so it can be edited there.
image_annotator_block = ModularPipelineBlocks.from_pretrained("diffusers/florence-2-custom-block", trust_remote_code=True, local_dir="/my-local-folder")
```
Any changes made to the block files in this folder will be reflected when you load the block again.
@@ -1,46 +0,0 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
-->
# AutoModel
The [`AutoModel`] class automatically detects and loads the correct model class (UNet, transformer, VAE) from a `config.json` file. You don't need to know the specific model class name ahead of time. It supports data types and device placement, and works across model types and libraries.
The example below loads a transformer from Diffusers and a text encoder from Transformers. Use the `subfolder` parameter to specify where to load the `config.json` file from.
```py
import torch
from diffusers import AutoModel, DiffusionPipeline
# Load the transformer; `subfolder` selects where the `config.json` is read from
transformer = AutoModel.from_pretrained(
    "Qwen/Qwen-Image", subfolder="transformer", torch_dtype=torch.bfloat16, device_map="cuda"
)

# Load the text encoder from the `text_encoder` subfolder of the same repository
text_encoder = AutoModel.from_pretrained(
    "Qwen/Qwen-Image", subfolder="text_encoder", torch_dtype=torch.bfloat16, device_map="cuda"
)
```
[`AutoModel`] also loads models from the [Hub](https://huggingface.co/models) that aren't included in Diffusers. Set `trust_remote_code=True` in [`AutoModel.from_pretrained`] to load custom models.
```py
import torch
from diffusers import AutoModel
# `trust_remote_code=True` is required to load a custom model that isn't included in Diffusers
transformer = AutoModel.from_pretrained(
    "custom/custom-transformer-model", trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="cuda"
)
```
If the custom model inherits from the [`ModelMixin`] class, it gets access to the same features as Diffusers model classes, like [regional compilation](../optimization/fp16#regional-compilation) and [group offloading](../optimization/memory#group-offloading).
> [!NOTE]
> Learn more about implementing custom models in the [Community components](../using-diffusers/custom_pipeline_overview#community-components) guide.
+1 -1
View File
@@ -5488,7 +5488,7 @@ Editing at Scale", many thanks to their contribution!
This implementation of Flux Kontext allows users to pass multiple reference images. Each image is encoded separately, and the resulting latent vectors are concatenated.
As explained in Section 3 of [the paper](https://huggingface.co/papers/2506.15742), the model's sequence concatenation mechanism can extend its capabilities to handle multiple reference images. However, note that the current version of Flux Kontext was not trained for this use case. In practice, stacking along the first axis does not yield correct results, while stacking along the other two axes appears to work.
As explained in Section 3 of [the paper](https://arxiv.org/pdf/2506.15742), the model's sequence concatenation mechanism can extend its capabilities to handle multiple reference images. However, note that the current version of Flux Kontext was not trained for this use case. In practice, stacking along the first axis does not yield correct results, while stacking along the other two axes appears to work.
## Example Usage
@@ -490,7 +490,7 @@ class RegionalPromptingStableDiffusionPipeline(
def prepare_extra_step_kwargs(self, generator, eta):
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
# eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
# and should be between [0, 1]
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
@@ -841,7 +841,7 @@ class RegionalPromptingStableDiffusionPipeline(
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
eta (`float`, *optional*, defaults to 0.0):
Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
@@ -872,7 +872,7 @@ class RegionalPromptingStableDiffusionPipeline(
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
guidance_rescale (`float`, *optional*, defaults to 0.0):
Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
using zero terminal SNR.
clip_skip (`int`, *optional*):
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
@@ -1062,7 +1062,7 @@ class RegionalPromptingStableDiffusionPipeline(
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
# Based on 3.4. in https://huggingface.co/papers/2305.08891
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
# compute the previous noisy sample x_t -> x_t-1
@@ -1668,7 +1668,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
r"""
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://huggingface.co/papers/2305.08891).
Flawed](https://arxiv.org/pdf/2305.08891.pdf).
Args:
noise_cfg (`torch.Tensor`):
+2 -1
View File
@@ -268,11 +268,12 @@ provide a simple script for LoRA fine-tuning Kontext in [train_dreambooth_lora_f
**important**
> [!NOTE]
> To make sure you can successfully run the latest version of the kontext example script, we highly recommend installing from source.
> To make sure you can successfully run the latest version of the kontext example script, we highly recommend installing from source, specifically from the commit mentioned below.
> To do this, execute the following steps in a new virtual environment:
> ```
> git clone https://github.com/huggingface/diffusers
> cd diffusers
> git checkout 05e7a854d0a5661f5b433f6dd5954c224b104f0b
> pip install -e .
> ```
+6 -265
View File
@@ -6,20 +6,11 @@ import torch
from accelerate import init_empty_weights
from huggingface_hub import hf_hub_download, snapshot_download
from safetensors.torch import load_file
from transformers import (
AutoProcessor,
AutoTokenizer,
CLIPImageProcessor,
CLIPVisionModel,
CLIPVisionModelWithProjection,
UMT5EncoderModel,
)
from transformers import AutoProcessor, AutoTokenizer, CLIPVisionModelWithProjection, UMT5EncoderModel
from diffusers import (
AutoencoderKLWan,
UniPCMultistepScheduler,
WanAnimatePipeline,
WanAnimateTransformer3DModel,
WanImageToVideoPipeline,
WanPipeline,
WanTransformer3DModel,
@@ -114,203 +105,8 @@ VACE_TRANSFORMER_KEYS_RENAME_DICT = {
"after_proj": "proj_out",
}
# Renames from original Wan Animate checkpoint key fragments to their Diffusers names.
# The entries are order-dependent: several of them (the norm2/norm3 swap through a
# placeholder, and the `attn2.*_img` renames) only work if the entries before them have
# already been applied, so do not reorder this dict.
# NOTE(review): presumably applied to each key in insertion order as substring
# replacements -- confirm against the conversion loop.
ANIMATE_TRANSFORMER_KEYS_RENAME_DICT = {
    "time_embedding.0": "condition_embedder.time_embedder.linear_1",
    "time_embedding.2": "condition_embedder.time_embedder.linear_2",
    "text_embedding.0": "condition_embedder.text_embedder.linear_1",
    "text_embedding.2": "condition_embedder.text_embedder.linear_2",
    "time_projection.1": "condition_embedder.time_proj",
    "head.modulation": "scale_shift_table",
    "head.head": "proj_out",
    "modulation": "scale_shift_table",
    "ffn.0": "ffn.net.0.proj",
    "ffn.2": "ffn.net.2",
    # Hack to swap the layer names
    # The original model calls the norms in following order: norm1, norm3, norm2
    # We convert it to: norm1, norm2, norm3
    "norm2": "norm__placeholder",
    "norm3": "norm2",
    "norm__placeholder": "norm3",
    "img_emb.proj.0": "condition_embedder.image_embedder.norm1",
    "img_emb.proj.1": "condition_embedder.image_embedder.ff.net.0.proj",
    "img_emb.proj.3": "condition_embedder.image_embedder.ff.net.2",
    "img_emb.proj.4": "condition_embedder.image_embedder.norm2",
    # Add attention component mappings
    "self_attn.q": "attn1.to_q",
    "self_attn.k": "attn1.to_k",
    "self_attn.v": "attn1.to_v",
    "self_attn.o": "attn1.to_out.0",
    "self_attn.norm_q": "attn1.norm_q",
    "self_attn.norm_k": "attn1.norm_k",
    "cross_attn.q": "attn2.to_q",
    "cross_attn.k": "attn2.to_k",
    "cross_attn.v": "attn2.to_v",
    "cross_attn.o": "attn2.to_out.0",
    "cross_attn.norm_q": "attn2.norm_q",
    "cross_attn.norm_k": "attn2.norm_k",
    "cross_attn.k_img": "attn2.to_k_img",
    "cross_attn.v_img": "attn2.to_v_img",
    "cross_attn.norm_k_img": "attn2.norm_k_img",
    # After cross_attn -> attn2 rename, we need to rename the img keys
    "attn2.to_k_img": "attn2.add_k_proj",
    "attn2.to_v_img": "attn2.add_v_proj",
    "attn2.norm_k_img": "attn2.norm_added_k",
    # Wan Animate-specific mappings (motion encoder, face encoder, face adapter)
    # Motion encoder mappings
    # The name mapping is complicated for the convolutional part so we handle that in its own function
    "motion_encoder.enc.fc": "motion_encoder.motion_network",
    "motion_encoder.dec.direction.weight": "motion_encoder.motion_synthesis_weight",
    # Face encoder mappings - CausalConv1d has a .conv submodule that we need to flatten
    "face_encoder.conv1_local.conv": "face_encoder.conv1_local",
    "face_encoder.conv2.conv": "face_encoder.conv2",
    "face_encoder.conv3.conv": "face_encoder.conv3",
    # Face adapter mappings are handled in a separate function
}
# TODO: Verify this and simplify if possible.
def convert_animate_motion_encoder_weights(key: str, state_dict: Dict[str, Any], final_conv_idx: int = 8) -> None:
    """
    Convert a single motion encoder entry of the Animate model, in place.

    In the original model:
    - All Linear layers in fc use EqualLinear
    - All Conv2d layers in convs use EqualConv2d (except blur_conv which is initialized separately)
    - Blur kernels are stored as buffers in Sequential modules
    - ConvLayer is nn.Sequential with indices: [Blur (optional), EqualConv2d, FusedLeakyReLU (optional)]

    Conversion strategy:
    1. Drop .kernel buffers (blur kernels)
    2. Rename sequential indices to named components (e.g., 0 -> conv2d, 1 -> bias_leaky_relu)

    Keys that this function does not know how to map are left in `state_dict` unchanged instead of
    failing on an unbound local, so that a later strict load can report them by name.

    Args:
        key: The state dict key to convert.
        state_dict: The state dict being converted; it is modified in place.
        final_conv_idx: Index within `enc.net_app.convs` of the final conv layer, which maps to `conv_out`.
    """
    # Skip if not a weight, bias, or kernel
    if ".weight" not in key and ".bias" not in key and ".kernel" not in key:
        return

    # Handle Blur kernel buffers from original implementation.
    # Diffusers constructs blur kernels as a non-persistent buffer so we must drop these keys
    # to avoid strict load errors.
    if ".kernel" in key and "motion_encoder" in key:
        state_dict.pop(key, None)
        return

    # Only the convolutional part (ConvLayer / ResBlock sequentials) is renamed here
    if ".enc.net_app.convs." not in key:
        return

    is_weight = key.endswith(".weight")
    is_bias = key.endswith(".bias")
    if not (is_weight or is_bias):
        # Contains ".weight"/".bias" somewhere but is not itself a weight or bias parameter
        return

    # Examples:
    #   - enc.net_app.convs.0.0.weight -> conv_in.weight (initial conv layer weight)
    #   - enc.net_app.convs.0.1.bias -> conv_in.act_fn.bias (initial conv layer bias)
    #   - enc.net_app.convs.{n:1-7}.conv1.0.weight -> res_blocks.{(n-1):0-6}.conv1.weight (conv1 weight)
    #   - enc.net_app.convs.{n:1-7}.conv1.1.bias -> res_blocks.{(n-1):0-6}.conv1.act_fn.bias (conv1 bias)
    #   - enc.net_app.convs.{n:1-7}.conv2.1.weight -> res_blocks.{(n-1):0-6}.conv2.weight (conv2 weight)
    #   - enc.net_app.convs.{n:1-7}.skip.1.weight -> res_blocks.{(n-1):0-6}.conv_skip.weight (skip conv weight)
    #   - enc.net_app.convs.8 -> conv_out (final conv layer)
    parts = key.split(".")
    convs_idx = parts.index("convs")

    # The nn.Sequential index always follows "convs"; leave the key alone if it does not
    index_part = parts[convs_idx + 1]
    if not index_part.isdigit():
        return
    sequential_idx = int(index_part)

    if sequential_idx == 0:
        new_key = "motion_encoder.conv_in.weight" if is_weight else "motion_encoder.conv_in.act_fn.bias"
    elif sequential_idx == final_conv_idx:
        if not is_weight:
            # Only the weight of the final conv has a known mapping
            return
        new_key = "motion_encoder.conv_out.weight"
    else:
        # Intermediate .convs. layers, which get mapped to .res_blocks.
        layer_name = parts[convs_idx + 2]
        if layer_name == "skip":
            layer_name = "conv_skip"
        param_name = "weight" if is_weight else "act_fn.bias"
        new_key = "motion_encoder.res_blocks." + ".".join([str(sequential_idx - 1), layer_name, param_name])

    param = state_dict.pop(key)
    if is_bias:
        # Biases are stored with singleton dimensions in the original checkpoint; flatten them
        param = param.squeeze()
    state_dict[new_key] = param
def convert_animate_face_adapter_weights(key: str, state_dict: Dict[str, Any]) -> None:
    """
    Convert a single face adapter entry of the Animate model, in place.

    The original model uses a fused KV projection but the diffusers model uses separate K and V projections,
    so `linear1_kv` parameters are split in half along dim 0. The remaining known layers are renamed.

    Keys whose layer name is not recognised are left in `state_dict` unchanged instead of failing on an
    unbound local, so that a later strict load can report them by name.

    Args:
        key: The state dict key to convert.
        state_dict: The state dict being converted; it is modified in place.
    """
    # Skip if not a weight or bias
    if ".weight" not in key and ".bias" not in key:
        return

    if ".fuser_blocks." not in key:
        return

    parts = key.split(".")
    if "fuser_blocks" not in parts:
        return
    module_list_idx = parts.index("fuser_blocks")

    # Expect exactly `<block_idx>.<layer_name>.<param_name>` after "fuser_blocks"
    if (len(parts) - 1) - module_list_idx != 3:
        return
    block_idx, layer_name, param_name = parts[module_list_idx + 1 : module_list_idx + 4]

    prefix = "face_adapter."

    if layer_name == "linear1_kv":
        # Split the fused KV projection into separate K and V projections
        new_key_k = prefix + ".".join([block_idx, "to_k", param_name])
        new_key_v = prefix + ".".join([block_idx, "to_v", param_name])

        kv_proj = state_dict.pop(key)
        k_proj, v_proj = torch.chunk(kv_proj, 2, dim=0)
        state_dict[new_key_k] = k_proj
        state_dict[new_key_v] = v_proj
        return

    layer_renames = {
        "q_norm": "norm_q",
        "k_norm": "norm_k",
        "linear1_q": "to_q",
        "linear2": "to_out",
    }
    new_layer_name = layer_renames.get(layer_name)
    if new_layer_name is None:
        # Unknown layer: there is no mapping for it, so leave the key as it is
        return

    new_key = prefix + ".".join([block_idx, new_layer_name, param_name])
    state_dict[new_key] = state_dict.pop(key)
# Per-model tables of special-case handlers. Each entry maps a key fragment to a function
# called as `handler(key, state_dict)` that rewrites the state dict in place.
# NOTE(review): presumably the fragment is matched as a substring of each checkpoint key --
# confirm against the conversion loop.
TRANSFORMER_SPECIAL_KEYS_REMAP = {}
VACE_TRANSFORMER_SPECIAL_KEYS_REMAP = {}
# Wan Animate needs dedicated handling for the motion encoder and face adapter weights
ANIMATE_TRANSFORMER_SPECIAL_KEYS_REMAP = {
    "motion_encoder": convert_animate_motion_encoder_weights,
    "face_adapter": convert_animate_face_adapter_weights,
}
def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
@@ -568,37 +364,6 @@ def get_transformer_config(model_type: str) -> Tuple[Dict[str, Any], ...]:
}
RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
elif model_type == "Wan2.2-Animate-14B":
config = {
"model_id": "Wan-AI/Wan2.2-Animate-14B",
"diffusers_config": {
"image_dim": 1280,
"added_kv_proj_dim": 5120,
"attention_head_dim": 128,
"cross_attn_norm": True,
"eps": 1e-06,
"ffn_dim": 13824,
"freq_dim": 256,
"in_channels": 36,
"num_attention_heads": 40,
"num_layers": 40,
"out_channels": 16,
"patch_size": (1, 2, 2),
"qk_norm": "rms_norm_across_heads",
"text_dim": 4096,
"rope_max_seq_len": 1024,
"pos_embed_seq_len": None,
"motion_encoder_size": 512, # Start of Wan Animate-specific configs
"motion_style_dim": 512,
"motion_dim": 20,
"motion_encoder_dim": 512,
"face_encoder_hidden_dim": 1024,
"face_encoder_num_heads": 4,
"inject_face_latents_blocks": 5,
},
}
RENAME_DICT = ANIMATE_TRANSFORMER_KEYS_RENAME_DICT
SPECIAL_KEYS_REMAP = ANIMATE_TRANSFORMER_SPECIAL_KEYS_REMAP
return config, RENAME_DICT, SPECIAL_KEYS_REMAP
@@ -615,12 +380,10 @@ def convert_transformer(model_type: str, stage: str = None):
original_state_dict = load_sharded_safetensors(model_dir)
with init_empty_weights():
if "Animate" in model_type:
transformer = WanAnimateTransformer3DModel.from_config(diffusers_config)
elif "VACE" in model_type:
transformer = WanVACETransformer3DModel.from_config(diffusers_config)
else:
if "VACE" not in model_type:
transformer = WanTransformer3DModel.from_config(diffusers_config)
else:
transformer = WanVACETransformer3DModel.from_config(diffusers_config)
for key in list(original_state_dict.keys()):
new_key = key[:]
@@ -634,12 +397,7 @@ def convert_transformer(model_type: str, stage: str = None):
continue
handler_fn_inplace(key, original_state_dict)
# Load state dict into the meta model, which will materialize the tensors
transformer.load_state_dict(original_state_dict, strict=True, assign=True)
# Move to CPU to ensure all tensors are materialized
transformer = transformer.to("cpu")
return transformer
@@ -1168,7 +926,7 @@ DTYPE_MAPPING = {
if __name__ == "__main__":
args = get_args()
if "Wan2.2" in args.model_type and "TI2V" not in args.model_type and "Animate" not in args.model_type:
if "Wan2.2" in args.model_type and "TI2V" not in args.model_type:
transformer = convert_transformer(args.model_type, stage="high_noise_model")
transformer_2 = convert_transformer(args.model_type, stage="low_noise_model")
else:
@@ -1184,7 +942,7 @@ if __name__ == "__main__":
tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
if "FLF2V" in args.model_type:
flow_shift = 16.0
elif "TI2V" in args.model_type or "Animate" in args.model_type:
elif "TI2V" in args.model_type:
flow_shift = 5.0
else:
flow_shift = 3.0
@@ -1196,8 +954,6 @@ if __name__ == "__main__":
if args.dtype != "none":
dtype = DTYPE_MAPPING[args.dtype]
transformer.to(dtype)
if transformer_2 is not None:
transformer_2.to(dtype)
if "Wan2.2" and "I2V" in args.model_type and "TI2V" not in args.model_type:
pipe = WanImageToVideoPipeline(
@@ -1260,21 +1016,6 @@ if __name__ == "__main__":
vae=vae,
scheduler=scheduler,
)
elif "Animate" in args.model_type:
image_encoder = CLIPVisionModel.from_pretrained(
"laion/CLIP-ViT-H-14-laion2B-s32B-b79K", torch_dtype=torch.bfloat16
)
image_processor = CLIPImageProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
pipe = WanAnimatePipeline(
transformer=transformer,
text_encoder=text_encoder,
tokenizer=tokenizer,
vae=vae,
scheduler=scheduler,
image_encoder=image_encoder,
image_processor=image_processor,
)
else:
pipe = WanPipeline(
transformer=transformer,
-4
View File
@@ -268,7 +268,6 @@ else:
"UNetSpatioTemporalConditionModel",
"UVit2DModel",
"VQModel",
"WanAnimateTransformer3DModel",
"WanTransformer3DModel",
"WanVACETransformer3DModel",
"attention_backend",
@@ -637,7 +636,6 @@ else:
"VisualClozeGenerationPipeline",
"VisualClozePipeline",
"VQDiffusionPipeline",
"WanAnimatePipeline",
"WanImageToVideoPipeline",
"WanPipeline",
"WanVACEPipeline",
@@ -979,7 +977,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
UNetSpatioTemporalConditionModel,
UVit2DModel,
VQModel,
WanAnimateTransformer3DModel,
WanTransformer3DModel,
WanVACETransformer3DModel,
attention_backend,
@@ -1318,7 +1315,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
VisualClozeGenerationPipeline,
VisualClozePipeline,
VQDiffusionPipeline,
WanAnimatePipeline,
WanImageToVideoPipeline,
WanPipeline,
WanVACEPipeline,
+1 -1
View File
@@ -373,7 +373,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
r"""
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://huggingface.co/papers/2305.08891).
Flawed](https://arxiv.org/pdf/2305.08891.pdf).
Args:
noise_cfg (`torch.Tensor`):
+2 -2
View File
@@ -409,7 +409,7 @@ class VaeImageProcessor(ConfigMixin):
src_w = width if ratio < src_ratio else image.width * height // image.height
src_h = height if ratio >= src_ratio else image.height * width // image.width
resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION[self.config.resample])
resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION["lanczos"])
res = Image.new("RGB", (width, height))
res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
@@ -460,7 +460,7 @@ class VaeImageProcessor(ConfigMixin):
src_w = width if ratio > src_ratio else image.width * height // image.height
src_h = height if ratio <= src_ratio else image.height * width // image.width
resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION[self.config.resample])
resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION["lanczos"])
res = Image.new("RGB", (width, height))
res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
return res
-2
View File
@@ -108,7 +108,6 @@ if is_torch_available():
_import_structure["transformers.transformer_skyreels_v2"] = ["SkyReelsV2Transformer3DModel"]
_import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
_import_structure["transformers.transformer_wan"] = ["WanTransformer3DModel"]
_import_structure["transformers.transformer_wan_animate"] = ["WanAnimateTransformer3DModel"]
_import_structure["transformers.transformer_wan_vace"] = ["WanVACETransformer3DModel"]
_import_structure["unets.unet_1d"] = ["UNet1DModel"]
_import_structure["unets.unet_2d"] = ["UNet2DModel"]
@@ -215,7 +214,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
T5FilmDecoder,
Transformer2DModel,
TransformerTemporalModel,
WanAnimateTransformer3DModel,
WanTransformer3DModel,
WanVACETransformer3DModel,
)
+5 -11
View File
@@ -383,18 +383,12 @@ def _check_shape(
attn_mask: Optional[torch.Tensor] = None,
**kwargs,
) -> None:
# Expected shapes:
# query: (batch_size, seq_len_q, num_heads, head_dim)
# key: (batch_size, seq_len_kv, num_heads, head_dim)
# value: (batch_size, seq_len_kv, num_heads, head_dim)
# attn_mask: (seq_len_q, seq_len_kv) or (batch_size, seq_len_q, seq_len_kv)
# or (batch_size, num_heads, seq_len_q, seq_len_kv)
if query.shape[-1] != key.shape[-1]:
raise ValueError("Query and key must have the same head dimension.")
if key.shape[-3] != value.shape[-3]:
raise ValueError("Key and value must have the same sequence length.")
if attn_mask is not None and attn_mask.shape[-1] != key.shape[-3]:
raise ValueError("Attention mask must match the key's sequence length.")
raise ValueError("Query and key must have the same last dimension.")
if query.shape[-2] != value.shape[-2]:
raise ValueError("Query and value must have the same second to last dimension.")
if attn_mask is not None and attn_mask.shape[-1] != key.shape[-2]:
raise ValueError("Attention mask must match the key's second to last dimension.")
# ===== Helper functions =====
@@ -16,7 +16,7 @@
# QwenImageVAE is further fine-tuned from the Wan Video VAE to achieve improved performance.
# For more information about the Wan VAE, please refer to:
# - GitHub: https://github.com/Wan-Video/Wan2.1
# - Paper: https://huggingface.co/papers/2503.20314
# - arXiv: https://arxiv.org/abs/2503.20314
from typing import List, Optional, Tuple, Union
@@ -42,5 +42,4 @@ if is_torch_available():
from .transformer_skyreels_v2 import SkyReelsV2Transformer3DModel
from .transformer_temporal import TransformerTemporalModel
from .transformer_wan import WanTransformer3DModel
from .transformer_wan_animate import WanAnimateTransformer3DModel
from .transformer_wan_vace import WanVACETransformer3DModel
@@ -275,12 +275,7 @@ class PRXEmbedND(nn.Module):
def rope(self, pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
assert dim % 2 == 0
is_mps = pos.device.type == "mps"
is_npu = pos.device.type == "npu"
dtype = torch.float32 if (is_mps or is_npu) else torch.float64
scale = torch.arange(0, dim, 2, dtype=dtype, device=pos.device) / dim
scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
omega = 1.0 / (theta**scale)
out = pos.unsqueeze(-1) * omega.unsqueeze(0)
out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
@@ -188,11 +188,6 @@ class WanRotaryPosEmbed(nn.Module):
h_dim = w_dim = 2 * (attention_head_dim // 6)
t_dim = attention_head_dim - h_dim - w_dim
self.t_dim = t_dim
self.h_dim = h_dim
self.w_dim = w_dim
freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
freqs_cos = []
@@ -218,7 +213,11 @@ class WanRotaryPosEmbed(nn.Module):
p_t, p_h, p_w = self.patch_size
ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
split_sizes = [self.t_dim, self.h_dim, self.w_dim]
split_sizes = [
self.attention_head_dim - 2 * (self.attention_head_dim // 3),
self.attention_head_dim // 3,
self.attention_head_dim // 3,
]
freqs_cos = self.freqs_cos.split(split_sizes, dim=1)
freqs_sin = self.freqs_sin.split(split_sizes, dim=1)
File diff suppressed because it is too large Load Diff
@@ -861,10 +861,6 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
else:
sub_blocks[block_name] = block
self.sub_blocks = sub_blocks
if not len(self.block_names) == len(self.block_classes):
raise ValueError(
f"In {self.__class__.__name__}, the number of block_names and block_classes must be the same."
)
def _get_inputs(self):
inputs = []
@@ -132,7 +132,6 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
@property
def inputs(self) -> List[InputParam]:
return [
InputParam("latents"),
InputParam(name="height"),
InputParam(name="width"),
InputParam(name="num_images_per_prompt", default=1),
@@ -197,11 +196,11 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
f"You have passed a list of generators of length {len(block_state.generator)}, but requested an effective batch"
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
)
if block_state.latents is None:
block_state.latents = randn_tensor(
shape, generator=block_state.generator, device=device, dtype=block_state.dtype
)
block_state.latents = components.pachifier.pack_latents(block_state.latents)
block_state.latents = randn_tensor(
shape, generator=block_state.generator, device=device, dtype=block_state.dtype
)
block_state.latents = components.pachifier.pack_latents(block_state.latents)
self.set_block_state(state, block_state)
return components, state
@@ -550,7 +549,8 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
block_state.width // components.vae_scale_factor // 2,
)
]
] * block_state.batch_size
* block_state.batch_size
]
block_state.txt_seq_lens = (
block_state.prompt_embeds_mask.sum(dim=1).tolist() if block_state.prompt_embeds_mask is not None else None
)
@@ -74,9 +74,8 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
block_state = self.get_block_state(state)
# YiYi Notes: remove support for output_type = "latents', we can just skip decode/encode step in modular
vae_scale_factor = components.vae_scale_factor
block_state.latents = components.pachifier.unpack_latents(
block_state.latents, block_state.height, block_state.width, vae_scale_factor=vae_scale_factor
block_state.latents, block_state.height, block_state.width
)
block_state.latents = block_state.latents.to(components.vae.dtype)
@@ -503,8 +503,6 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks):
block_state.prompt_embeds = block_state.prompt_embeds[:, : block_state.max_sequence_length]
block_state.prompt_embeds_mask = block_state.prompt_embeds_mask[:, : block_state.max_sequence_length]
block_state.negative_prompt_embeds = None
block_state.negative_prompt_embeds_mask = None
if components.requires_unconditional_embeds:
negative_prompt = block_state.negative_prompt or ""
block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds(
@@ -629,8 +627,6 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
device=device,
)
block_state.negative_prompt_embeds = None
block_state.negative_prompt_embeds_mask = None
if components.requires_unconditional_embeds:
negative_prompt = block_state.negative_prompt or " "
block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds_edit(
@@ -683,8 +679,6 @@ class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
device=device,
)
block_state.negative_prompt_embeds = None
block_state.negative_prompt_embeds_mask = None
if components.requires_unconditional_embeds:
negative_prompt = block_state.negative_prompt or " "
block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = (
@@ -523,7 +523,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
QwenImageOptionalControlNetBeforeDenoiseStep,
QwenImageAutoDenoiseStep,
]
block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise"]
block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise", "decode"]
@property
def description(self):
@@ -534,6 +534,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
+ " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
+ " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n"
+ " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
+ " - `QwenImageAutoDecodeStep` (decode) decodes the latents into images.\n\n"
+ "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
+ " - for image-to-image generation, you need to provide `image_latents`\n"
+ " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
@@ -26,7 +26,10 @@ class QwenImagePachifier(ConfigMixin):
config_name = "config.json"
@register_to_config
def __init__(self, patch_size: int = 2):
def __init__(
self,
patch_size: int = 2,
):
super().__init__()
def pack_latents(self, latents):
+2 -14
View File
@@ -385,13 +385,7 @@ else:
"WuerstchenDecoderPipeline",
"WuerstchenPriorPipeline",
]
_import_structure["wan"] = [
"WanPipeline",
"WanImageToVideoPipeline",
"WanVideoToVideoPipeline",
"WanVACEPipeline",
"WanAnimatePipeline",
]
_import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline", "WanVideoToVideoPipeline", "WanVACEPipeline"]
_import_structure["kandinsky5"] = ["Kandinsky5T2VPipeline"]
_import_structure["skyreels_v2"] = [
"SkyReelsV2DiffusionForcingPipeline",
@@ -809,13 +803,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
UniDiffuserTextDecoder,
)
from .visualcloze import VisualClozeGenerationPipeline, VisualClozePipeline
from .wan import (
WanAnimatePipeline,
WanImageToVideoPipeline,
WanPipeline,
WanVACEPipeline,
WanVideoToVideoPipeline,
)
from .wan import WanImageToVideoPipeline, WanPipeline, WanVACEPipeline, WanVideoToVideoPipeline
from .wuerstchen import (
WuerstchenCombinedPipeline,
WuerstchenDecoderPipeline,
@@ -245,7 +245,7 @@ class BriaPipeline(DiffusionPipeline):
return self._guidance_scale
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
# corresponds to doing no classifier free guidance.
@property
def do_classifier_free_guidance(self):
@@ -489,11 +489,11 @@ class BriaPipeline(DiffusionPipeline):
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
passed will be used. Must be in descending order.
guidance_scale (`float`, *optional*, defaults to 5.0):
Guidance scale as defined in [Classifier-Free Diffusion
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
the text `prompt`, usually at the expense of lower image quality.
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. If not defined, one has to pass
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
@@ -337,7 +337,7 @@ class BriaFiboPipeline(DiffusionPipeline):
return self._guidance_scale
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
# of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
# corresponds to doing no classifier free guidance.
@property
@@ -498,11 +498,11 @@ class BriaFiboPipeline(DiffusionPipeline):
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
passed will be used. Must be in descending order.
guidance_scale (`float`, *optional*, defaults to 5.0):
Guidance scale as defined in [Classifier-Free Diffusion
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
the text `prompt`, usually at the expense of lower image quality.
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
negative_prompt (`str` or `List[str]`, *optional*):
The prompt or prompts not to guide the image generation. If not defined, one has to pass
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
+4 -5
View File
@@ -590,10 +590,9 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixi
the text `prompt`, usually at the expense of lower image quality.
guidance_rescale (`float`, *optional*, defaults to 0.0):
Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
[Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
using zero terminal SNR.
Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
[Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
Guidance rescale factor should fix overexposure when using zero terminal SNR.
num_videos_per_prompt (`int`, *optional*, defaults to 1):
The number of videos to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -778,7 +777,7 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixi
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
if self.guidance_rescale > 0:
# Based on 3.4. in https://huggingface.co/papers/2305.08891
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
noise_pred = rescale_noise_cfg(
noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
)
@@ -927,10 +927,9 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraL
the text `prompt`, usually at the expense of lower image quality.
guidance_rescale (`float`, *optional*, defaults to 0.0):
Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
[Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
using zero terminal SNR.
Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
[Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
Guidance rescale factor should fix overexposure when using zero terminal SNR.
num_videos_per_prompt (`int`, *optional*, defaults to 1):
The number of videos to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -1195,7 +1194,7 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraL
timestep, _ = timestep.chunk(2)
if self.guidance_rescale > 0:
# Based on 3.4. in https://huggingface.co/papers/2305.08891
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
noise_pred = rescale_noise_cfg(
noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
)
@@ -654,10 +654,9 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
the text `prompt`, usually at the expense of lower image quality.
guidance_rescale (`float`, *optional*, defaults to 0.0):
Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
[Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
using zero terminal SNR.
Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
[Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
Guidance rescale factor should fix overexposure when using zero terminal SNR.
num_videos_per_prompt (`int`, *optional*, defaults to 1):
The number of videos to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -852,7 +851,7 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
timestep, _ = timestep.chunk(2)
if self.guidance_rescale > 0:
# Based on 3.4. in https://huggingface.co/papers/2305.08891
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
noise_pred = rescale_noise_cfg(
noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
)
+9 -44
View File
@@ -69,39 +69,6 @@ ASPECT_RATIO_512_BIN = {
"2.0": [704, 352],
}
ASPECT_RATIO_1024_BIN = {
"0.49": [704, 1440],
"0.52": [736, 1408],
"0.53": [736, 1376],
"0.57": [768, 1344],
"0.59": [768, 1312],
"0.62": [800, 1280],
"0.67": [832, 1248],
"0.68": [832, 1216],
"0.78": [896, 1152],
"0.83": [928, 1120],
"0.94": [992, 1056],
"1.0": [1024, 1024],
"1.06": [1056, 992],
"1.13": [1088, 960],
"1.21": [1120, 928],
"1.29": [1152, 896],
"1.37": [1184, 864],
"1.46": [1216, 832],
"1.5": [1248, 832],
"1.71": [1312, 768],
"1.75": [1344, 768],
"1.87": [1376, 736],
"1.91": [1408, 736],
"2.05": [1440, 704],
}
ASPECT_RATIO_BINS = {
256: ASPECT_RATIO_256_BIN,
512: ASPECT_RATIO_512_BIN,
1024: ASPECT_RATIO_1024_BIN,
}
logger = logging.get_logger(__name__)
@@ -569,11 +536,11 @@ class PRXPipeline(
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
passed will be used. Must be in descending order.
guidance_scale (`float`, *optional*, defaults to 4.0):
Guidance scale as defined in [Classifier-Free Diffusion
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
the text `prompt`, usually at the expense of lower image quality.
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
num_images_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -633,12 +600,10 @@ class PRXPipeline(
"Resolution binning requires a VAE with image_processor, but VAE is not available. "
"Set use_resolution_binning=False or provide a VAE."
)
if self.default_sample_size not in ASPECT_RATIO_BINS:
raise ValueError(
f"Resolution binning is only supported for default_sample_size in {list(ASPECT_RATIO_BINS.keys())}, "
f"but got {self.default_sample_size}. Set use_resolution_binning=False to disable aspect ratio binning."
)
aspect_ratio_bin = ASPECT_RATIO_BINS[self.default_sample_size]
if self.default_sample_size <= 256:
aspect_ratio_bin = ASPECT_RATIO_256_BIN
else:
aspect_ratio_bin = ASPECT_RATIO_512_BIN
# Store original dimensions
orig_height, orig_width = height, width
@@ -415,11 +415,11 @@ class SkyReelsV2Pipeline(DiffusionPipeline, SkyReelsV2LoraLoaderMixin):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, defaults to `6.0`):
Guidance scale as defined in [Classifier-Free Diffusion
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
the text `prompt`, usually at the expense of lower image quality.
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
num_videos_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -647,11 +647,11 @@ class SkyReelsV2DiffusionForcingPipeline(DiffusionPipeline, SkyReelsV2LoraLoader
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, defaults to `6.0`):
Guidance scale as defined in [Classifier-Free Diffusion
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
the text `prompt`, usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
num_videos_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -698,11 +698,11 @@ class SkyReelsV2DiffusionForcingImageToVideoPipeline(DiffusionPipeline, SkyReels
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, defaults to `5.0`):
Guidance scale as defined in [Classifier-Free Diffusion
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
the text `prompt`, usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
num_videos_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
@@ -524,11 +524,11 @@ class SkyReelsV2ImageToVideoPipeline(DiffusionPipeline, SkyReelsV2LoraLoaderMixi
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, defaults to `5.0`):
Guidance scale as defined in [Classifier-Free Diffusion
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
the text `prompt`, usually at the expense of lower image quality.
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
num_videos_per_prompt (`int`, *optional*, defaults to 1):
The number of images to generate per prompt.
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+1 -2
View File
@@ -23,7 +23,6 @@ except OptionalDependencyNotAvailable:
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
_import_structure["pipeline_wan"] = ["WanPipeline"]
_import_structure["pipeline_wan_animate"] = ["WanAnimatePipeline"]
_import_structure["pipeline_wan_i2v"] = ["WanImageToVideoPipeline"]
_import_structure["pipeline_wan_vace"] = ["WanVACEPipeline"]
_import_structure["pipeline_wan_video2video"] = ["WanVideoToVideoPipeline"]
@@ -36,10 +35,10 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
from ...utils.dummy_torch_and_transformers_objects import *
else:
from .pipeline_wan import WanPipeline
from .pipeline_wan_animate import WanAnimatePipeline
from .pipeline_wan_i2v import WanImageToVideoPipeline
from .pipeline_wan_vace import WanVACEPipeline
from .pipeline_wan_video2video import WanVideoToVideoPipeline
else:
import sys
@@ -1,185 +0,0 @@
# Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Tuple, Union
import numpy as np
import PIL.Image
import torch
from ...configuration_utils import register_to_config
from ...image_processor import VaeImageProcessor
from ...utils import PIL_INTERPOLATION
class WanAnimateImageProcessor(VaeImageProcessor):
r"""
Image processor to preprocess the reference (character) image for the Wan Animate model.
Args:
do_resize (`bool`, *optional*, defaults to `True`):
Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept
`height` and `width` arguments from [`image_processor.VaeImageProcessor.preprocess`] method.
vae_scale_factor (`int`, *optional*, defaults to `8`):
VAE (spatial) scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of
this factor.
vae_latent_channels (`int`, *optional*, defaults to `16`):
VAE latent channels.
spatial_patch_size (`Tuple[int, int]`, *optional*, defaults to `(2, 2)`):
The spatial patch size used by the diffusion transformer. For Wan models, this is typically (2, 2).
resample (`str`, *optional*, defaults to `lanczos`):
Resampling filter to use when resizing the image.
do_normalize (`bool`, *optional*, defaults to `True`):
Whether to normalize the image to [-1,1].
do_binarize (`bool`, *optional*, defaults to `False`):
Whether to binarize the image to 0/1.
do_convert_rgb (`bool`, *optional*, defaults to be `False`):
Whether to convert the images to RGB format.
do_convert_grayscale (`bool`, *optional*, defaults to be `False`):
Whether to convert the images to grayscale format.
fill_color (`str` or `float` or `Tuple[float, ...]`, *optional*, defaults to `None`):
An optional fill color when `resize_mode` is set to `"fill"`. This will fill the empty space with that
color instead of filling with data from the image. Any valid `color` argument to `PIL.Image.new` is valid;
if `None`, will default to filling with data from `image`.
"""
@register_to_config
def __init__(
self,
do_resize: bool = True,
vae_scale_factor: int = 8,
vae_latent_channels: int = 16,
spatial_patch_size: Tuple[int, int] = (2, 2),
resample: str = "lanczos",
reducing_gap: int = None,
do_normalize: bool = True,
do_binarize: bool = False,
do_convert_rgb: bool = False,
do_convert_grayscale: bool = False,
fill_color: Optional[Union[str, float, Tuple[float, ...]]] = 0,
):
super().__init__()
if do_convert_rgb and do_convert_grayscale:
raise ValueError(
"`do_convert_rgb` and `do_convert_grayscale` can not both be set to `True`,"
" if you intended to convert the image into RGB format, please set `do_convert_grayscale = False`.",
" if you intended to convert the image into grayscale format, please set `do_convert_rgb = False`",
)
def _resize_and_fill(
self,
image: PIL.Image.Image,
width: int,
height: int,
) -> PIL.Image.Image:
r"""
Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center
the image within the dimensions, filling empty with data from image.
Args:
image (`PIL.Image.Image`):
The image to resize and fill.
width (`int`):
The width to resize the image to.
height (`int`):
The height to resize the image to.
Returns:
`PIL.Image.Image`:
The resized and filled image.
"""
ratio = width / height
src_ratio = image.width / image.height
fill_with_image_data = self.config.fill_color is None
fill_color = self.config.fill_color or 0
src_w = width if ratio < src_ratio else image.width * height // image.height
src_h = height if ratio >= src_ratio else image.height * width // image.width
resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION[self.config.resample])
res = PIL.Image.new("RGB", (width, height), color=fill_color)
res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
if fill_with_image_data:
if ratio < src_ratio:
fill_height = height // 2 - src_h // 2
if fill_height > 0:
res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0))
res.paste(
resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)),
box=(0, fill_height + src_h),
)
elif ratio > src_ratio:
fill_width = width // 2 - src_w // 2
if fill_width > 0:
res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0))
res.paste(
resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)),
box=(fill_width + src_w, 0),
)
return res
def get_default_height_width(
    self,
    image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
    height: Optional[int] = None,
    width: Optional[int] = None,
) -> Tuple[int, int]:
    r"""
    Resolve the target height and width and round each one down to a multiple of
    `vae_scale_factor * spatial_patch_size` (index 0 of `spatial_patch_size` for the height, index 1 for the
    width).

    Args:
        image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
            The image input. It is only inspected for a dimension that was not passed explicitly: a PIL image
            supplies `image.height` / `image.width`, a PyTorch tensor supplies `image.shape[2]` /
            `image.shape[3]`, and any other input supplies `image.shape[1]` / `image.shape[2]`.
        height (`Optional[int]`, *optional*, defaults to `None`):
            The requested height. If `None`, the height is read from `image`.
        width (`Optional[int]`, *optional*, defaults to `None`):
            The requested width. If `None`, the width is read from `image`.

    Returns:
        `Tuple[int, int]`:
            The `(height, width)` pair after rounding down to the respective multiples.
    """
    # Fall back to the image's own dimensions only where the caller gave none.
    if height is None:
        if isinstance(image, PIL.Image.Image):
            height = image.height
        else:
            height = image.shape[2] if isinstance(image, torch.Tensor) else image.shape[1]

    if width is None:
        if isinstance(image, PIL.Image.Image):
            width = image.width
        else:
            width = image.shape[3] if isinstance(image, torch.Tensor) else image.shape[2]

    area = width * height
    h_over_w = height / width

    scale = self.config.vae_scale_factor
    multiple_h = scale * self.config.spatial_patch_size[0]
    multiple_w = scale * self.config.spatial_patch_size[1]

    # Recover each side from the area and aspect ratio, then floor it to its multiple.
    side_h = round(np.sqrt(area * h_over_w))
    side_w = round(np.sqrt(area / h_over_w))
    height = side_h // multiple_h * multiple_h
    width = side_w // multiple_w * multiple_w

    return height, width
File diff suppressed because it is too large Load Diff
@@ -758,11 +758,11 @@ class WanVACEPipeline(DiffusionPipeline, WanLoraLoaderMixin):
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
expense of slower inference.
guidance_scale (`float`, defaults to `5.0`):
Guidance scale as defined in [Classifier-Free Diffusion
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
the text `prompt`, usually at the expense of lower image quality.
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
`guidance_scale` is defined as `w` of equation 2. of [Imagen
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
usually at the expense of lower image quality.
guidance_scale_2 (`float`, *optional*, defaults to `None`):
Guidance scale for the low-noise stage transformer (`transformer_2`). If `None` and the pipeline's
`boundary_ratio` is not None, uses the same value as `guidance_scale`. Only used when `transformer_2`
+19 -132
View File
@@ -1,6 +1,6 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import torch
@@ -9,48 +9,13 @@ from ..utils import BaseOutput
from .scheduling_utils import SchedulerMixin
def gumbel_noise(t: torch.Tensor, generator: Optional[torch.Generator] = None) -> torch.Tensor:
"""
Generate Gumbel noise for sampling.
Args:
t (`torch.Tensor`):
Input tensor to match the shape and dtype of the output noise.
generator (`torch.Generator`, *optional*):
A random number generator for reproducible sampling.
Returns:
`torch.Tensor`:
Gumbel-distributed noise with the same shape, dtype, and device as the input tensor.
"""
def gumbel_noise(t, generator=None):
device = generator.device if generator is not None else t.device
noise = torch.zeros_like(t, device=device).uniform_(0, 1, generator=generator).to(t.device)
return -torch.log((-torch.log(noise.clamp(1e-20))).clamp(1e-20))
def mask_by_random_topk(
mask_len: torch.Tensor,
probs: torch.Tensor,
temperature: float = 1.0,
generator: Optional[torch.Generator] = None,
) -> torch.Tensor:
"""
Mask tokens by selecting the top-k lowest confidence scores with temperature-based randomness.
Args:
mask_len (`torch.Tensor`):
Number of tokens to mask per sample in the batch.
probs (`torch.Tensor`):
Probability scores for each token.
temperature (`float`, *optional*, defaults to 1.0):
Temperature parameter for controlling randomness in the masking process.
generator (`torch.Generator`, *optional*):
A random number generator for reproducible sampling.
Returns:
`torch.Tensor`:
Boolean mask indicating which tokens should be masked.
"""
def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None):
confidence = torch.log(probs.clamp(1e-20)) + temperature * gumbel_noise(probs, generator=generator)
sorted_confidence = torch.sort(confidence, dim=-1).values
cut_off = torch.gather(sorted_confidence, 1, mask_len.long())
@@ -64,46 +29,28 @@ class AmusedSchedulerOutput(BaseOutput):
Output class for the scheduler's `step` function output.
Args:
prev_sample (`torch.LongTensor` of shape `(batch_size, height, width)` or `(batch_size, sequence_length)`):
Computed sample `(x_{t-1})` of previous timestep with token IDs. `prev_sample` should be used as next model
input in the denoising loop.
pred_original_sample (`torch.LongTensor` of shape `(batch_size, height, width)` or `(batch_size, sequence_length)`, *optional*):
The predicted fully denoised sample `(x_{0})` with token IDs based on the model output from the current
timestep. `pred_original_sample` can be used to preview progress or for guidance.
prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
denoising loop.
pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
`pred_original_sample` can be used to preview progress or for guidance.
"""
prev_sample: torch.Tensor
pred_original_sample: Optional[torch.Tensor] = None
pred_original_sample: torch.Tensor = None
class AmusedScheduler(SchedulerMixin, ConfigMixin):
"""
A scheduler for masked token generation as used in [`AmusedPipeline`].
This scheduler iteratively unmasks tokens based on their confidence scores, following either a cosine or linear
schedule. Unlike traditional diffusion schedulers that work with continuous pixel values, this scheduler operates
on discrete token IDs, making it suitable for autoregressive and non-autoregressive masked token generation models.
This scheduler inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the
generic methods the library implements for all schedulers such as loading and saving.
Args:
mask_token_id (`int`):
The token ID used to represent masked tokens in the sequence.
masking_schedule (`Literal["cosine", "linear"]`, *optional*, defaults to `"cosine"`):
The schedule type for determining the mask ratio at each timestep. Can be either `"cosine"` or `"linear"`.
"""
order = 1
temperatures: Optional[torch.Tensor]
timesteps: Optional[torch.Tensor]
temperatures: torch.Tensor
@register_to_config
def __init__(
self,
mask_token_id: int,
masking_schedule: Literal["cosine", "linear"] = "cosine",
masking_schedule: str = "cosine",
):
self.temperatures = None
self.timesteps = None
@@ -111,23 +58,9 @@ class AmusedScheduler(SchedulerMixin, ConfigMixin):
def set_timesteps(
self,
num_inference_steps: int,
temperature: Union[float, Tuple[float, float], List[float]] = (2, 0),
device: Optional[Union[str, torch.device]] = None,
) -> None:
"""
Set the discrete timesteps used for the diffusion chain (to be run before inference).
Args:
num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model.
temperature (`Union[float, Tuple[float, float], List[float]]`, *optional*, defaults to `(2, 0)`):
Temperature parameter(s) for controlling the randomness of sampling. If a tuple or list is provided,
temperatures will be linearly interpolated between the first and second values across all timesteps. If
a single value is provided, temperatures will be linearly interpolated from that value to 0.01.
device (`str` or `torch.device`, *optional*):
The device to which the timesteps and temperatures should be moved to. If `None`, the timesteps are not
moved.
"""
temperature: Union[int, Tuple[int, int], List[int]] = (2, 0),
device: Union[str, torch.device] = None,
):
self.timesteps = torch.arange(num_inference_steps, device=device).flip(0)
if isinstance(temperature, (tuple, list)):
@@ -138,38 +71,12 @@ class AmusedScheduler(SchedulerMixin, ConfigMixin):
def step(
self,
model_output: torch.Tensor,
timestep: int,
timestep: torch.long,
sample: torch.LongTensor,
starting_mask_ratio: float = 1.0,
starting_mask_ratio: int = 1,
generator: Optional[torch.Generator] = None,
return_dict: bool = True,
) -> Union[AmusedSchedulerOutput, Tuple[torch.Tensor, torch.Tensor]]:
"""
Predict the sample at the previous timestep by masking tokens based on confidence scores.
Args:
model_output (`torch.Tensor`):
The direct output from the learned diffusion model. Typically of shape `(batch_size, num_tokens,
codebook_size)` or `(batch_size, codebook_size, height, width)` for 2D inputs.
timestep (`int`):
The current discrete timestep in the diffusion chain.
sample (`torch.LongTensor`):
A current instance of a sample created by the diffusion process. Contains token IDs, with masked
positions indicated by `mask_token_id`.
starting_mask_ratio (`float`, *optional*, defaults to 1.0):
A multiplier applied to the mask ratio schedule. Values less than 1.0 will result in fewer tokens being
masked at each step.
generator (`torch.Generator`, *optional*):
A random number generator for reproducible sampling.
return_dict (`bool`, *optional*, defaults to `True`):
Whether to return an [`~schedulers.scheduling_amused.AmusedSchedulerOutput`] or a plain tuple.
Returns:
[`~schedulers.scheduling_amused.AmusedSchedulerOutput`] or `tuple`:
If `return_dict` is `True`, [`~schedulers.scheduling_amused.AmusedSchedulerOutput`] is returned,
otherwise a tuple is returned where the first element is the sample tensor (`prev_sample`) and the
second element is the predicted original sample tensor (`pred_original_sample`).
"""
) -> Union[AmusedSchedulerOutput, Tuple]:
two_dim_input = sample.ndim == 3 and model_output.ndim == 4
if two_dim_input:
@@ -230,27 +137,7 @@ class AmusedScheduler(SchedulerMixin, ConfigMixin):
return AmusedSchedulerOutput(prev_sample, pred_original_sample)
def add_noise(
self,
sample: torch.LongTensor,
timesteps: int,
generator: Optional[torch.Generator] = None,
) -> torch.LongTensor:
"""
Add noise to a sample by randomly masking tokens according to the masking schedule.
Args:
sample (`torch.LongTensor`):
The input sample containing token IDs to be partially masked.
timesteps (`int`):
The timestep that determines how much masking to apply. Higher timesteps result in more masking.
generator (`torch.Generator`, *optional*):
A random number generator for reproducible masking.
Returns:
`torch.LongTensor`:
The sample with some tokens replaced by `mask_token_id` according to the masking schedule.
"""
def add_noise(self, sample, timesteps, generator=None):
step_idx = (self.timesteps == timesteps).nonzero()
ratio = (step_idx + 1) / len(self.timesteps)
@@ -1,6 +1,6 @@
import math
from dataclasses import dataclass
from typing import Literal, Optional, Tuple, Union
from typing import Optional, Tuple, Union
import torch
@@ -12,10 +12,10 @@ from .scheduling_utils import SchedulerMixin
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -23,17 +23,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -121,7 +121,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -287,23 +287,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
return c_skip, c_out
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
def index_for_timestep(
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
) -> int:
"""
Find the index of a given timestep in the timestep schedule.
Args:
timestep (`float` or `torch.Tensor`):
The timestep value to find in the schedule.
schedule_timesteps (`torch.Tensor`, *optional*):
The timestep schedule to search in. If `None`, uses `self.timesteps`.
Returns:
`int`:
The index of the timestep in the schedule. For the very first step, returns the second index if
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
"""
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
@@ -318,14 +302,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
return indices[pos].item()
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
"""
Initialize the step index for the scheduler based on the given timestep.
Args:
timestep (`float` or `torch.Tensor`):
The current timestep to initialize the step index from.
"""
def _init_step_index(self, timestep):
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -433,21 +410,6 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.Tensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise schedule at the specified timesteps.
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise tensor to add to the original samples.
timesteps (`torch.Tensor`):
The timesteps at which to add noise, determining the noise level from the schedule.
Returns:
`torch.Tensor`:
The noisy samples with added noise scaled according to the timestep schedule.
"""
# Make sure sigmas and timesteps have the same device and dtype as original_samples
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -137,7 +137,7 @@ class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -266,19 +266,6 @@ class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
def _sigma_to_t(self, sigma, log_sigmas):
"""
Convert sigma values to corresponding timestep values through interpolation.
Args:
sigma (`np.ndarray`):
The sigma value(s) to convert to timestep(s).
log_sigmas (`np.ndarray`):
The logarithm of the sigma schedule used for interpolation.
Returns:
`np.ndarray`:
The interpolated timestep value(s) corresponding to the input sigma(s).
"""
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -550,21 +537,6 @@ class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.Tensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise schedule at the specified timesteps.
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise tensor to add to the original samples.
timesteps (`torch.Tensor`):
The timesteps at which to add noise, determining the noise level from the schedule.
Returns:
`torch.Tensor`:
The noisy samples with added noise scaled according to the timestep schedule.
"""
# Make sure sigmas and timesteps have the same device and dtype as original_samples
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+39 -106
View File
@@ -17,7 +17,7 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -49,10 +49,10 @@ class DDIMSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -60,17 +60,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -93,17 +92,17 @@ def betas_for_alpha_bar(
return torch.tensor(betas, dtype=torch.float32)
def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
def rescale_zero_terminal_snr(betas):
"""
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
Args:
betas (`torch.Tensor`):
The betas that the scheduler is being initialized with.
the betas that the scheduler is being initialized with.
Returns:
`torch.Tensor`:
Rescaled betas with zero terminal SNR.
`torch.Tensor`: rescaled betas with zero terminal SNR
"""
# Convert betas to alphas_bar_sqrt
alphas = 1.0 - betas
@@ -144,9 +143,9 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
The starting `beta` value of inference.
beta_end (`float`, defaults to 0.02):
The final `beta` value.
beta_schedule (`Literal["linear", "scaled_linear", "squaredcos_cap_v2"]`, defaults to `"linear"`):
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Must be one
of `"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`.
beta_schedule (`str`, defaults to `"linear"`):
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
`linear`, `scaled_linear`, or `squaredcos_cap_v2`.
trained_betas (`np.ndarray`, *optional*):
Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
clip_sample (`bool`, defaults to `True`):
@@ -159,10 +158,10 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
otherwise it uses the alpha value at step 0.
steps_offset (`int`, defaults to 0):
An offset added to the inference steps, as required by some model families.
prediction_type (`Literal["epsilon", "sample", "v_prediction"]`, defaults to `"epsilon"`):
Prediction type of the scheduler function. Must be one of `"epsilon"` (predicts the noise of the diffusion
process), `"sample"` (directly predicts the noisy sample), or `"v_prediction"` (see section 2.4 of [Imagen
Video](https://huggingface.co/papers/2210.02303) paper).
prediction_type (`str`, defaults to `epsilon`, *optional*):
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
`sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
Video](https://imagen.research.google/video/paper.pdf) paper).
thresholding (`bool`, defaults to `False`):
Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
as Stable Diffusion.
@@ -170,10 +169,9 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
sample_max_value (`float`, defaults to 1.0):
The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
timestep_spacing (`Literal["leading", "trailing", "linspace"]`, defaults to `"leading"`):
The way the timesteps should be scaled. Must be one of `"leading"`, `"trailing"`, or `"linspace"`. Refer to
Table 2 of the [Common Diffusion Noise Schedules and Sample Steps are
Flawed](https://huggingface.co/papers/2305.08891) for more information.
timestep_spacing (`str`, defaults to `"leading"`):
The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
rescale_betas_zero_snr (`bool`, defaults to `False`):
Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
dark samples instead of limiting it to samples with medium brightness. Loosely related to
@@ -189,17 +187,17 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
num_train_timesteps: int = 1000,
beta_start: float = 0.0001,
beta_end: float = 0.02,
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
beta_schedule: str = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
clip_sample: bool = True,
set_alpha_to_one: bool = True,
steps_offset: int = 0,
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
prediction_type: str = "epsilon",
thresholding: bool = False,
dynamic_thresholding_ratio: float = 0.995,
clip_sample_range: float = 1.0,
sample_max_value: float = 1.0,
timestep_spacing: Literal["leading", "trailing", "linspace"] = "leading",
timestep_spacing: str = "leading",
rescale_betas_zero_snr: bool = False,
):
if trained_betas is not None:
@@ -252,25 +250,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
"""
return sample
def _get_variance(self, timestep: int, prev_timestep: int) -> torch.Tensor:
"""
Computes the variance of the noise added at a given diffusion step.
For a given `timestep` and its previous step, this method calculates the variance as defined in DDIM/DDPM
literature:
var_t = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
where alpha_prod and beta_prod are cumulative products of alphas and betas, respectively.
Args:
timestep (`int`):
The current timestep in the diffusion process.
prev_timestep (`int`):
The previous timestep in the diffusion process. If negative, uses `final_alpha_cumprod`.
Returns:
`torch.Tensor`:
The variance for the current timestep.
"""
def _get_variance(self, timestep, prev_timestep):
alpha_prod_t = self.alphas_cumprod[timestep]
alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
beta_prod_t = 1 - alpha_prod_t
@@ -283,8 +263,6 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -292,14 +270,6 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
@@ -324,18 +294,13 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
return sample
def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None) -> None:
def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
"""
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
Args:
num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model.
device (`Union[str, torch.device]`, *optional*):
The device to use for the timesteps.
Raises:
ValueError: If `num_inference_steps` is larger than `self.config.num_train_timesteps`.
"""
if num_inference_steps > self.config.num_train_timesteps:
@@ -381,7 +346,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
sample: torch.Tensor,
eta: float = 0.0,
use_clipped_model_output: bool = False,
generator: Optional[torch.Generator] = None,
generator=None,
variance_noise: Optional[torch.Tensor] = None,
return_dict: bool = True,
) -> Union[DDIMSchedulerOutput, Tuple]:
@@ -392,21 +357,20 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
Args:
model_output (`torch.Tensor`):
The direct output from learned diffusion model.
timestep (`int`):
timestep (`float`):
The current discrete timestep in the diffusion chain.
sample (`torch.Tensor`):
A current instance of a sample created by the diffusion process.
eta (`float`, *optional*, defaults to 0.0):
The weight of noise for added noise in diffusion step. A value of 0 corresponds to DDIM (deterministic)
and 1 corresponds to DDPM (fully stochastic).
use_clipped_model_output (`bool`, *optional*, defaults to `False`):
eta (`float`):
The weight of noise for added noise in diffusion step.
use_clipped_model_output (`bool`, defaults to `False`):
If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
`use_clipped_model_output` has no effect.
generator (`torch.Generator`, *optional*):
A random number generator for reproducible sampling.
variance_noise (`torch.Tensor`, *optional*):
A random number generator.
variance_noise (`torch.Tensor`):
Alternative to generating noise with `generator` by directly providing the noise for the variance
itself. Useful for methods such as [`CycleDiffusion`].
return_dict (`bool`, *optional*, defaults to `True`):
@@ -513,22 +477,6 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
diffusion process).
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise to add to the samples.
timesteps (`torch.IntTensor`):
The timesteps indicating the noise level for each sample.
Returns:
`torch.Tensor`:
The noisy samples.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls
@@ -551,21 +499,6 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
"""
Compute the velocity prediction from the sample and noise according to the velocity formula.
Args:
sample (`torch.Tensor`):
The input sample.
noise (`torch.Tensor`):
The noise tensor.
timesteps (`torch.IntTensor`):
The timesteps for velocity computation.
Returns:
`torch.Tensor`:
The computed velocity.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
@@ -584,5 +517,5 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
return velocity
def __len__(self) -> int:
def __len__(self):
return self.config.num_train_timesteps
@@ -18,7 +18,7 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -49,10 +49,10 @@ class DDIMSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -60,17 +60,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -409,22 +408,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
diffusion process).
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise to add to the samples.
timesteps (`torch.IntTensor`):
The timesteps indicating the noise level for each sample.
Returns:
`torch.Tensor`:
The noisy samples.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls
@@ -447,21 +430,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
"""
Compute the velocity prediction from the sample and noise according to the velocity formula.
Args:
sample (`torch.Tensor`):
The input sample.
noise (`torch.Tensor`):
The noise tensor.
timesteps (`torch.IntTensor`):
The timesteps for velocity computation.
Returns:
`torch.Tensor`:
The computed velocity.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
@@ -16,7 +16,7 @@
# and https://github.com/hojonathanho/diffusion
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -47,10 +47,10 @@ class DDIMSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -58,17 +58,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -96,13 +95,13 @@ def rescale_zero_terminal_snr(betas):
"""
Rescales betas to have zero terminal SNR. Based on https://huggingface.co/papers/2305.08891 (Algorithm 1).
Args:
betas (`torch.Tensor`):
The betas that the scheduler is being initialized with.
the betas that the scheduler is being initialized with.
Returns:
`torch.Tensor`:
Rescaled betas with zero terminal SNR.
`torch.Tensor`: rescaled betas with zero terminal SNR
"""
# Convert betas to alphas_bar_sqrt
alphas = 1.0 - betas
@@ -17,7 +17,7 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -49,10 +49,10 @@ class DDIMParallelSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -60,17 +60,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -98,13 +97,13 @@ def rescale_zero_terminal_snr(betas):
"""
Rescales betas to have zero terminal SNR. Based on https://huggingface.co/papers/2305.08891 (Algorithm 1).
Args:
betas (`torch.Tensor`):
The betas that the scheduler is being initialized with.
the betas that the scheduler is being initialized with.
Returns:
`torch.Tensor`:
Rescaled betas with zero terminal SNR.
`torch.Tensor`: rescaled betas with zero terminal SNR
"""
# Convert betas to alphas_bar_sqrt
alphas = 1.0 - betas
@@ -195,17 +194,17 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
num_train_timesteps: int = 1000,
beta_start: float = 0.0001,
beta_end: float = 0.02,
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
beta_schedule: str = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
clip_sample: bool = True,
set_alpha_to_one: bool = True,
steps_offset: int = 0,
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
prediction_type: str = "epsilon",
thresholding: bool = False,
dynamic_thresholding_ratio: float = 0.995,
clip_sample_range: float = 1.0,
sample_max_value: float = 1.0,
timestep_spacing: Literal["leading", "trailing", "linspace"] = "leading",
timestep_spacing: str = "leading",
rescale_betas_zero_snr: bool = False,
):
if trained_betas is not None:
@@ -286,8 +285,6 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -295,14 +292,6 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
@@ -335,11 +324,6 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
Args:
num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model.
device (`Union[str, torch.device]`, *optional*):
The device to use for the timesteps.
Raises:
ValueError: If `num_inference_steps` is larger than `self.config.num_train_timesteps`.
"""
if num_inference_steps > self.config.num_train_timesteps:
@@ -618,22 +602,6 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
diffusion process).
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise to add to the samples.
timesteps (`torch.IntTensor`):
The timesteps indicating the noise level for each sample.
Returns:
`torch.Tensor`:
The noisy samples.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls
@@ -656,21 +624,6 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
"""
Compute the velocity prediction from the sample and noise according to the velocity formula.
Args:
sample (`torch.Tensor`):
The input sample.
noise (`torch.Tensor`):
The noise tensor.
timesteps (`torch.IntTensor`):
The timesteps for velocity computation.
Returns:
`torch.Tensor`:
The computed velocity.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
+42 -118
View File
@@ -16,7 +16,7 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -46,10 +46,10 @@ class DDPMSchedulerOutput(BaseOutput):
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -57,17 +57,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -91,17 +90,17 @@ def betas_for_alpha_bar(
# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
def rescale_zero_terminal_snr(betas):
"""
Rescales betas to have zero terminal SNR. Based on https://huggingface.co/papers/2305.08891 (Algorithm 1).
Args:
betas (`torch.Tensor`):
The betas that the scheduler is being initialized with.
the betas that the scheduler is being initialized with.
Returns:
`torch.Tensor`:
Rescaled betas with zero terminal SNR.
`torch.Tensor`: rescaled betas with zero terminal SNR
"""
# Convert betas to alphas_bar_sqrt
alphas = 1.0 - betas
@@ -135,37 +134,39 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
methods the library implements for all schedulers such as loading and saving.
Args:
num_train_timesteps (`int`, defaults to `1000`):
num_train_timesteps (`int`, defaults to 1000):
The number of diffusion steps to train the model.
beta_start (`float`, defaults to `0.0001`):
beta_start (`float`, defaults to 0.0001):
The starting `beta` value of inference.
beta_end (`float`, defaults to `0.02`):
beta_end (`float`, defaults to 0.02):
The final `beta` value.
beta_schedule (`"linear"`, `"scaled_linear"`, `"squaredcos_cap_v2"`, or `"sigmoid"`, defaults to `"linear"`):
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model.
beta_schedule (`str`, defaults to `"linear"`):
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
`linear`, `scaled_linear`, `squaredcos_cap_v2`, or `sigmoid`.
trained_betas (`np.ndarray`, *optional*):
An array of betas to pass directly to the constructor without using `beta_start` and `beta_end`.
variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, defaults to `"fixed_small"`):
Clip the variance when adding noise to the denoised sample.
variance_type (`str`, defaults to `"fixed_small"`):
Clip the variance when adding noise to the denoised sample. Choose from `fixed_small`, `fixed_small_log`,
`fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
clip_sample (`bool`, defaults to `True`):
Clip the predicted sample for numerical stability.
clip_sample_range (`float`, defaults to `1.0`):
clip_sample_range (`float`, defaults to 1.0):
The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
prediction_type (`"epsilon"`, `"sample"`, or `"v_prediction"`, defaults to `"epsilon"`):
prediction_type (`str`, defaults to `epsilon`, *optional*):
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
`sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
Video](https://imagen.research.google/video/paper.pdf) paper).
thresholding (`bool`, defaults to `False`):
Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
as Stable Diffusion.
dynamic_thresholding_ratio (`float`, defaults to `0.995`):
dynamic_thresholding_ratio (`float`, defaults to 0.995):
The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
sample_max_value (`float`, defaults to `1.0`):
sample_max_value (`float`, defaults to 1.0):
The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
timestep_spacing (`"linspace"`, `"leading"`, or `"trailing"`, defaults to `"leading"`):
timestep_spacing (`str`, defaults to `"leading"`):
The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
steps_offset (`int`, defaults to `0`):
steps_offset (`int`, defaults to 0):
An offset added to the inference steps, as required by some model families.
rescale_betas_zero_snr (`bool`, defaults to `False`):
Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
@@ -182,18 +183,16 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
num_train_timesteps: int = 1000,
beta_start: float = 0.0001,
beta_end: float = 0.02,
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2", "sigmoid"] = "linear",
beta_schedule: str = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
variance_type: Literal[
"fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"
] = "fixed_small",
variance_type: str = "fixed_small",
clip_sample: bool = True,
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
prediction_type: str = "epsilon",
thresholding: bool = False,
dynamic_thresholding_ratio: float = 0.995,
clip_sample_range: float = 1.0,
sample_max_value: float = 1.0,
timestep_spacing: Literal["linspace", "leading", "trailing"] = "leading",
timestep_spacing: str = "leading",
steps_offset: int = 0,
rescale_betas_zero_snr: bool = False,
):
@@ -323,31 +322,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
self.timesteps = torch.from_numpy(timesteps).to(device)
def _get_variance(
self,
t: int,
predicted_variance: Optional[torch.Tensor] = None,
variance_type: Optional[
Literal["fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"]
] = None,
) -> torch.Tensor:
"""
Compute the variance for a given timestep according to the specified variance type.
Args:
t (`int`):
The current timestep.
predicted_variance (`torch.Tensor`, *optional*):
The predicted variance from the model. Used only when `variance_type` is `"learned"` or
`"learned_range"`.
variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, *optional*):
The type of variance to compute. If `None`, uses the variance type specified in the scheduler
configuration.
Returns:
`torch.Tensor`:
The computed variance.
"""
def _get_variance(self, t, predicted_variance=None, variance_type=None):
prev_t = self.previous_timestep(t)
alpha_prod_t = self.alphas_cumprod[t]
@@ -389,8 +364,6 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -398,14 +371,6 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
@@ -435,7 +400,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
model_output: torch.Tensor,
timestep: int,
sample: torch.Tensor,
generator: Optional[torch.Generator] = None,
generator=None,
return_dict: bool = True,
) -> Union[DDPMSchedulerOutput, Tuple]:
"""
@@ -445,19 +410,20 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
Args:
model_output (`torch.Tensor`):
The direct output from learned diffusion model.
timestep (`int`):
timestep (`float`):
The current discrete timestep in the diffusion chain.
sample (`torch.Tensor`):
A current instance of a sample created by the diffusion process.
generator (`torch.Generator`, *optional*):
A random number generator.
return_dict (`bool`, defaults to `True`):
return_dict (`bool`, *optional*, defaults to `True`):
Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`.
Returns:
[`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`:
If return_dict is `True`, [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] is returned, otherwise a
tuple is returned where the first element is the sample tensor.
"""
t = timestep
@@ -538,22 +504,6 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
diffusion process).
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise to add to the samples.
timesteps (`torch.IntTensor`):
The timesteps indicating the noise level for each sample.
Returns:
`torch.Tensor`:
The noisy samples.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls
@@ -575,21 +525,6 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
return noisy_samples
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
"""
Compute the velocity prediction from the sample and noise according to the velocity formula.
Args:
sample (`torch.Tensor`):
The input sample.
noise (`torch.Tensor`):
The noise tensor.
timesteps (`torch.IntTensor`):
The timesteps for velocity computation.
Returns:
`torch.Tensor`:
The computed velocity.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
@@ -608,21 +543,10 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
return velocity
def __len__(self) -> int:
def __len__(self):
return self.config.num_train_timesteps
def previous_timestep(self, timestep: int) -> int:
"""
Compute the previous timestep in the diffusion chain.
Args:
timestep (`int`):
The current timestep.
Returns:
`int`:
The previous timestep.
"""
def previous_timestep(self, timestep):
if self.custom_timesteps or self.num_inference_steps:
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
if index == self.timesteps.shape[0] - 1:
@@ -16,7 +16,7 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -48,10 +48,10 @@ class DDPMParallelSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -59,17 +59,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -97,13 +96,13 @@ def rescale_zero_terminal_snr(betas):
"""
Rescales betas to have zero terminal SNR. Based on https://huggingface.co/papers/2305.08891 (Algorithm 1).
Args:
betas (`torch.Tensor`):
The betas that the scheduler is being initialized with.
the betas that the scheduler is being initialized with.
Returns:
`torch.Tensor`:
Rescaled betas with zero terminal SNR.
`torch.Tensor`: rescaled betas with zero terminal SNR
"""
# Convert betas to alphas_bar_sqrt
alphas = 1.0 - betas
@@ -192,18 +191,16 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
num_train_timesteps: int = 1000,
beta_start: float = 0.0001,
beta_end: float = 0.02,
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2", "sigmoid"] = "linear",
beta_schedule: str = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
variance_type: Literal[
"fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"
] = "fixed_small",
variance_type: str = "fixed_small",
clip_sample: bool = True,
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
prediction_type: str = "epsilon",
thresholding: bool = False,
dynamic_thresholding_ratio: float = 0.995,
clip_sample_range: float = 1.0,
sample_max_value: float = 1.0,
timestep_spacing: Literal["linspace", "leading", "trailing"] = "leading",
timestep_spacing: str = "leading",
steps_offset: int = 0,
rescale_betas_zero_snr: bool = False,
):
@@ -336,31 +333,7 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
self.timesteps = torch.from_numpy(timesteps).to(device)
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance
def _get_variance(
self,
t: int,
predicted_variance: Optional[torch.Tensor] = None,
variance_type: Optional[
Literal["fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"]
] = None,
) -> torch.Tensor:
"""
Compute the variance for a given timestep according to the specified variance type.
Args:
t (`int`):
The current timestep.
predicted_variance (`torch.Tensor`, *optional*):
The predicted variance from the model. Used only when `variance_type` is `"learned"` or
`"learned_range"`.
variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, *optional*):
The type of variance to compute. If `None`, uses the variance type specified in the scheduler
configuration.
Returns:
`torch.Tensor`:
The computed variance.
"""
def _get_variance(self, t, predicted_variance=None, variance_type=None):
prev_t = self.previous_timestep(t)
alpha_prod_t = self.alphas_cumprod[t]
@@ -403,8 +376,6 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -412,14 +383,6 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
@@ -630,22 +593,6 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
diffusion process).
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise to add to the samples.
timesteps (`torch.IntTensor`):
The timesteps indicating the noise level for each sample.
Returns:
`torch.Tensor`:
The noisy samples.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls
@@ -668,21 +615,6 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
"""
Compute the velocity prediction from the sample and noise according to the velocity formula.
Args:
sample (`torch.Tensor`):
The input sample.
noise (`torch.Tensor`):
The noise tensor.
timesteps (`torch.IntTensor`):
The timesteps for velocity computation.
Returns:
`torch.Tensor`:
The computed velocity.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
@@ -706,17 +638,6 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
def previous_timestep(self, timestep):
"""
Compute the previous timestep in the diffusion chain.
Args:
timestep (`int`):
The current timestep.
Returns:
`int`:
The previous timestep.
"""
if self.custom_timesteps or self.num_inference_steps:
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
if index == self.timesteps.shape[0] - 1:
@@ -16,7 +16,7 @@
# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
import math
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -32,10 +32,10 @@ if is_scipy_available():
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -43,17 +43,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -230,7 +229,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -321,8 +320,6 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -330,14 +327,6 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
@@ -364,19 +353,6 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
def _sigma_to_t(self, sigma, log_sigmas):
"""
Convert sigma values to corresponding timestep values through interpolation.
Args:
sigma (`np.ndarray`):
The sigma value(s) to convert to timestep(s).
log_sigmas (`np.ndarray`):
The logarithm of the sigma schedule used for interpolation.
Returns:
`np.ndarray`:
The interpolated timestep value(s) corresponding to the input sigma(s).
"""
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -412,20 +388,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
Models](https://huggingface.co/papers/2206.00364).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following the Karras noise schedule.
"""
"""Constructs the noise schedule of Karras et al. (2022)."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -451,19 +414,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct an exponential noise schedule.
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following an exponential schedule.
"""
"""Constructs an exponential noise schedule."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -487,24 +438,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
def _convert_to_beta(
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
) -> torch.Tensor:
"""
Construct a beta noise schedule as proposed in [Beta Sampling is All You
Need](https://huggingface.co/papers/2407.12173).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
alpha (`float`, *optional*, defaults to `0.6`):
The alpha parameter for the beta distribution.
beta (`float`, *optional*, defaults to `0.6`):
The beta parameter for the beta distribution.
Returns:
`torch.Tensor`:
The converted sigma values following a beta distribution schedule.
"""
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -18,7 +18,7 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -50,10 +50,10 @@ class DDIMSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -61,17 +61,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -446,22 +445,6 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
diffusion process).
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise to add to the samples.
timesteps (`torch.IntTensor`):
The timesteps indicating the noise level for each sample.
Returns:
`torch.Tensor`:
The noisy samples.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls
@@ -484,21 +467,6 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
"""
Compute the velocity prediction from the sample and noise according to the velocity formula.
Args:
sample (`torch.Tensor`):
The input sample.
noise (`torch.Tensor`):
The noise tensor.
timesteps (`torch.IntTensor`):
The timesteps for velocity computation.
Returns:
`torch.Tensor`:
The computed velocity.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
@@ -15,7 +15,7 @@
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
import math
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -32,10 +32,10 @@ if is_scipy_available():
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -43,17 +43,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -81,13 +80,13 @@ def rescale_zero_terminal_snr(betas):
"""
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
Args:
betas (`torch.Tensor`):
The betas that the scheduler is being initialized with.
the betas that the scheduler is being initialized with.
Returns:
`torch.Tensor`:
Rescaled betas with zero terminal SNR.
`torch.Tensor`: rescaled betas with zero terminal SNR
"""
# Convert betas to alphas_bar_sqrt
alphas = 1.0 - betas
@@ -324,7 +323,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -461,8 +460,6 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -470,14 +467,6 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
@@ -504,19 +493,6 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
def _sigma_to_t(self, sigma, log_sigmas):
"""
Convert sigma values to corresponding timestep values through interpolation.
Args:
sigma (`np.ndarray`):
The sigma value(s) to convert to timestep(s).
log_sigmas (`np.ndarray`):
The logarithm of the sigma schedule used for interpolation.
Returns:
`np.ndarray`:
The interpolated timestep value(s) corresponding to the input sigma(s).
"""
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -551,20 +527,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
Models](https://huggingface.co/papers/2206.00364).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following the Karras noise schedule.
"""
"""Constructs the noise schedule of Karras et al. (2022)."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -603,19 +566,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct an exponential noise schedule.
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following an exponential schedule.
"""
"""Constructs an exponential noise schedule."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -639,24 +590,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
def _convert_to_beta(
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
) -> torch.Tensor:
"""
Construct a beta noise schedule as proposed in [Beta Sampling is All You
Need](https://huggingface.co/papers/2407.12173).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
alpha (`float`, *optional*, defaults to `0.6`):
The alpha parameter for the beta distribution.
beta (`float`, *optional*, defaults to `0.6`):
The beta parameter for the beta distribution.
Returns:
`torch.Tensor`:
The converted sigma values following a beta distribution schedule.
"""
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -15,7 +15,7 @@
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
import math
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -32,10 +32,10 @@ if is_scipy_available():
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -43,17 +43,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -333,8 +332,6 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -342,14 +339,6 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
@@ -376,19 +365,6 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
def _sigma_to_t(self, sigma, log_sigmas):
"""
Convert sigma values to corresponding timestep values through interpolation.
Args:
sigma (`np.ndarray`):
The sigma value(s) to convert to timestep(s).
log_sigmas (`np.ndarray`):
The logarithm of the sigma schedule used for interpolation.
Returns:
`np.ndarray`:
The interpolated timestep value(s) corresponding to the input sigma(s).
"""
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -424,20 +400,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
Models](https://huggingface.co/papers/2206.00364).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following the Karras noise schedule.
"""
"""Constructs the noise schedule of Karras et al. (2022)."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -463,19 +426,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct an exponential noise schedule.
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following an exponential schedule.
"""
"""Constructs an exponential noise schedule."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -499,24 +450,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
def _convert_to_beta(
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
) -> torch.Tensor:
"""
Construct a beta noise schedule as proposed in [Beta Sampling is All You
Need](https://huggingface.co/papers/2407.12173).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
alpha (`float`, *optional*, defaults to `0.6`):
The alpha parameter for the beta distribution.
beta (`float`, *optional*, defaults to `0.6`):
The beta parameter for the beta distribution.
Returns:
`torch.Tensor`:
The converted sigma values following a beta distribution schedule.
"""
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -14,7 +14,7 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -115,10 +115,10 @@ class BrownianTreeNoiseSampler:
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -126,17 +126,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -251,23 +250,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
def index_for_timestep(
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
) -> int:
"""
Find the index of a given timestep in the timestep schedule.
Args:
timestep (`float` or `torch.Tensor`):
The timestep value to find in the schedule.
schedule_timesteps (`torch.Tensor`, *optional*):
The timestep schedule to search in. If `None`, uses `self.timesteps`.
Returns:
`int`:
The index of the timestep in the schedule. For the very first step, returns the second index if
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
"""
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
@@ -282,14 +265,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
return indices[pos].item()
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
"""
Initialize the step index for the scheduler based on the given timestep.
Args:
timestep (`float` or `torch.Tensor`):
The current timestep to initialize the step index from.
"""
def _init_step_index(self, timestep):
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -325,7 +301,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -453,19 +429,6 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
def _sigma_to_t(self, sigma, log_sigmas):
"""
Convert sigma values to corresponding timestep values through interpolation.
Args:
sigma (`np.ndarray`):
The sigma value(s) to convert to timestep(s).
log_sigmas (`np.ndarray`):
The logarithm of the sigma schedule used for interpolation.
Returns:
`np.ndarray`:
The interpolated timestep value(s) corresponding to the input sigma(s).
"""
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -504,19 +467,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct an exponential noise schedule.
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following an exponential schedule.
"""
"""Constructs an exponential noise schedule."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -540,24 +491,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
def _convert_to_beta(
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
) -> torch.Tensor:
"""
Construct a beta noise schedule as proposed in [Beta Sampling is All You
Need](https://huggingface.co/papers/2407.12173).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
alpha (`float`, *optional*, defaults to `0.6`):
The alpha parameter for the beta distribution.
beta (`float`, *optional*, defaults to `0.6`):
The beta parameter for the beta distribution.
Returns:
`torch.Tensor`:
The converted sigma values following a beta distribution schedule.
"""
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -711,21 +645,6 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.Tensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise schedule at the specified timesteps.
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise tensor to add to the original samples.
timesteps (`torch.Tensor`):
The timesteps at which to add noise, determining the noise level from the schedule.
Returns:
`torch.Tensor`:
The noisy samples with added noise scaled according to the timestep schedule.
"""
# Make sure sigmas and timesteps have the same device and dtype as original_samples
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -15,7 +15,7 @@
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
import math
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -34,10 +34,10 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -45,17 +45,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -295,7 +294,7 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -411,8 +410,6 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -420,14 +417,6 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
@@ -454,19 +443,6 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
def _sigma_to_t(self, sigma, log_sigmas):
"""
Convert sigma values to corresponding timestep values through interpolation.
Args:
sigma (`np.ndarray`):
The sigma value(s) to convert to timestep(s).
log_sigmas (`np.ndarray`):
The logarithm of the sigma schedule used for interpolation.
Returns:
`np.ndarray`:
The interpolated timestep value(s) corresponding to the input sigma(s).
"""
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -502,20 +478,7 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
Models](https://huggingface.co/papers/2206.00364).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following the Karras noise schedule.
"""
"""Constructs the noise schedule of Karras et al. (2022)."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -541,19 +504,7 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct an exponential noise schedule.
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following an exponential schedule.
"""
"""Constructs an exponential noise schedule."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -577,24 +528,7 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
def _convert_to_beta(
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
) -> torch.Tensor:
"""
Construct a beta noise schedule as proposed in [Beta Sampling is All You
Need](https://huggingface.co/papers/2407.12173).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
alpha (`float`, *optional*, defaults to `0.6`):
The alpha parameter for the beta distribution.
beta (`float`, *optional*, defaults to `0.6`):
The beta parameter for the beta distribution.
Returns:
`torch.Tensor`:
The converted sigma values following a beta distribution schedule.
"""
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -169,7 +169,7 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -299,8 +299,6 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -308,14 +306,6 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
@@ -342,19 +332,6 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
def _sigma_to_t(self, sigma, log_sigmas):
"""
Convert sigma values to corresponding timestep values through interpolation.
Args:
sigma (`np.ndarray`):
The sigma value(s) to convert to timestep(s).
log_sigmas (`np.ndarray`):
The logarithm of the sigma schedule used for interpolation.
Returns:
`np.ndarray`:
The interpolated timestep value(s) corresponding to the input sigma(s).
"""
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -695,21 +672,6 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.Tensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise schedule at the specified timesteps.
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise tensor to add to the original samples.
timesteps (`torch.Tensor`):
The timesteps at which to add noise, determining the noise level from the schedule.
Returns:
`torch.Tensor`:
The noisy samples with added noise scaled according to the timestep schedule.
"""
# Make sure sigmas and timesteps have the same device and dtype as original_samples
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -155,7 +155,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -284,23 +284,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
return sigmas
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
def index_for_timestep(
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
) -> int:
"""
Find the index of a given timestep in the timestep schedule.
Args:
timestep (`float` or `torch.Tensor`):
The timestep value to find in the schedule.
schedule_timesteps (`torch.Tensor`, *optional*):
The timestep schedule to search in. If `None`, uses `self.timesteps`.
Returns:
`int`:
The index of the timestep in the schedule. For the very first step, returns the second index if
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
"""
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
@@ -315,14 +299,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
return indices[pos].item()
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
"""
Initialize the step index for the scheduler based on the given timestep.
Args:
timestep (`float` or `torch.Tensor`):
The current timestep to initialize the step index from.
"""
def _init_step_index(self, timestep):
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -436,21 +413,6 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.Tensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise schedule at the specified timesteps.
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise tensor to add to the original samples.
timesteps (`torch.Tensor`):
The timesteps at which to add noise, determining the noise level from the schedule.
Returns:
`torch.Tensor`:
The noisy samples with added noise scaled according to the timestep schedule.
"""
# Make sure sigmas and timesteps have the same device and dtype as original_samples
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -14,7 +14,7 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -49,10 +49,10 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -60,17 +60,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -98,13 +97,13 @@ def rescale_zero_terminal_snr(betas):
"""
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
Args:
betas (`torch.Tensor`):
The betas that the scheduler is being initialized with.
the betas that the scheduler is being initialized with.
Returns:
`torch.Tensor`:
Rescaled betas with zero terminal SNR.
`torch.Tensor`: rescaled betas with zero terminal SNR
"""
# Convert betas to alphas_bar_sqrt
alphas = 1.0 - betas
@@ -246,7 +245,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -320,23 +319,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
def index_for_timestep(
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
) -> int:
"""
Find the index of a given timestep in the timestep schedule.
Args:
timestep (`float` or `torch.Tensor`):
The timestep value to find in the schedule.
schedule_timesteps (`torch.Tensor`, *optional*):
The timestep schedule to search in. If `None`, uses `self.timesteps`.
Returns:
`int`:
The index of the timestep in the schedule. For the very first step, returns the second index if
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
"""
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
@@ -351,14 +334,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
return indices[pos].item()
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
"""
Initialize the step index for the scheduler based on the given timestep.
Args:
timestep (`float` or `torch.Tensor`):
The current timestep to initialize the step index from.
"""
def _init_step_index(self, timestep):
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -475,21 +451,6 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.Tensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise schedule at the specified timesteps.
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise tensor to add to the original samples.
timesteps (`torch.Tensor`):
The timesteps at which to add noise, determining the noise level from the schedule.
Returns:
`torch.Tensor`:
The noisy samples with added noise scaled according to the timestep schedule.
"""
# Make sure sigmas and timesteps have the same device and dtype as original_samples
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -14,7 +14,7 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -52,10 +52,10 @@ class EulerDiscreteSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -63,17 +63,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -97,17 +96,17 @@ def betas_for_alpha_bar(
# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
def rescale_zero_terminal_snr(betas):
"""
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
Args:
betas (`torch.Tensor`):
The betas that the scheduler is being initialized with.
the betas that the scheduler is being initialized with.
Returns:
`torch.Tensor`:
Rescaled betas with zero terminal SNR.
`torch.Tensor`: rescaled betas with zero terminal SNR
"""
# Convert betas to alphas_bar_sqrt
alphas = 1.0 - betas
@@ -147,17 +146,17 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
The starting `beta` value of inference.
beta_end (`float`, defaults to 0.02):
The final `beta` value.
beta_schedule (`Literal["linear", "scaled_linear", "squaredcos_cap_v2"]`, defaults to `"linear"`):
beta_schedule (`str`, defaults to `"linear"`):
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
`"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`.
`linear` or `scaled_linear`.
trained_betas (`np.ndarray`, *optional*):
Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
prediction_type (`Literal["epsilon", "sample", "v_prediction"]`, defaults to `"epsilon"`, *optional*):
Prediction type of the scheduler function; can be `"epsilon"` (predicts the noise of the diffusion
process), `"sample"` (directly predicts the noisy sample) or `"v_prediction"` (see section 2.4 of [Imagen
prediction_type (`str`, defaults to `epsilon`, *optional*):
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
`sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
Video](https://imagen.research.google/video/paper.pdf) paper).
interpolation_type (`Literal["linear", "log_linear"]`, defaults to `"linear"`, *optional*):
The interpolation type to compute intermediate sigmas for the scheduler denoising steps. Should be one of
interpolation_type (`str`, defaults to `"linear"`, *optional*):
The interpolation type to compute intermediate sigmas for the scheduler denoising steps. Should be one of
`"linear"` or `"log_linear"`.
use_karras_sigmas (`bool`, *optional*, defaults to `False`):
Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
@@ -167,26 +166,18 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
use_beta_sigmas (`bool`, *optional*, defaults to `False`):
Whether to use beta sigmas for step sizes in the noise schedule during the sampling process. Refer to [Beta
Sampling is All You Need](https://huggingface.co/papers/2407.12173) for more information.
sigma_min (`float`, *optional*):
The minimum sigma value for the noise schedule. If not provided, defaults to the last sigma in the
schedule.
sigma_max (`float`, *optional*):
The maximum sigma value for the noise schedule. If not provided, defaults to the first sigma in the
schedule.
timestep_spacing (`Literal["linspace", "leading", "trailing"]`, defaults to `"linspace"`):
timestep_spacing (`str`, defaults to `"linspace"`):
The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
timestep_type (`Literal["discrete", "continuous"]`, defaults to `"discrete"`):
The type of timesteps to use. Can be `"discrete"` or `"continuous"`.
steps_offset (`int`, defaults to 0):
An offset added to the inference steps, as required by some model families.
rescale_betas_zero_snr (`bool`, defaults to `False`):
Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
dark samples instead of limiting it to samples with medium brightness. Loosely related to
[`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
final_sigmas_type (`Literal["zero", "sigma_min"]`, defaults to `"zero"`):
final_sigmas_type (`str`, defaults to `"zero"`):
The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
sigma is the same as the last sigma in the training schedule. If `"zero"`, the final sigma is set to 0.
sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
"""
_compatibles = [e.name for e in KarrasDiffusionSchedulers]
@@ -198,20 +189,20 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
num_train_timesteps: int = 1000,
beta_start: float = 0.0001,
beta_end: float = 0.02,
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
beta_schedule: str = "linear",
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
interpolation_type: Literal["linear", "log_linear"] = "linear",
prediction_type: str = "epsilon",
interpolation_type: str = "linear",
use_karras_sigmas: Optional[bool] = False,
use_exponential_sigmas: Optional[bool] = False,
use_beta_sigmas: Optional[bool] = False,
sigma_min: Optional[float] = None,
sigma_max: Optional[float] = None,
timestep_spacing: Literal["linspace", "leading", "trailing"] = "linspace",
timestep_type: Literal["discrete", "continuous"] = "discrete",
timestep_spacing: str = "linspace",
timestep_type: str = "discrete", # can be "discrete" or "continuous"
steps_offset: int = 0,
rescale_betas_zero_snr: bool = False,
final_sigmas_type: Literal["zero", "sigma_min"] = "zero",
final_sigmas_type: str = "zero", # can be "zero" or "sigma_min"
):
if self.config.use_beta_sigmas and not is_scipy_available():
raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
@@ -268,15 +259,8 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
@property
def init_noise_sigma(self) -> Union[float, torch.Tensor]:
"""
The standard deviation of the initial noise distribution.
Returns:
`float` or `torch.Tensor`:
The standard deviation of the initial noise distribution, computed based on the maximum sigma value and
the timestep spacing configuration.
"""
def init_noise_sigma(self):
# standard deviation of the initial noise distribution
max_sigma = max(self.sigmas) if isinstance(self.sigmas, list) else self.sigmas.max()
if self.config.timestep_spacing in ["linspace", "trailing"]:
return max_sigma
@@ -284,34 +268,26 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
return (max_sigma**2 + 1) ** 0.5
@property
def step_index(self) -> Optional[int]:
def step_index(self):
"""
The index counter for current timestep. It will increase by 1 after each scheduler step.
Returns:
`int` or `None`:
The current step index, or `None` if not initialized.
The index counter for current timestep. It will increase by 1 after each scheduler step.
"""
return self._step_index
@property
def begin_index(self) -> Optional[int]:
def begin_index(self):
"""
The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
Returns:
`int` or `None`:
The begin index for the scheduler, or `None` if not set.
"""
return self._begin_index
# Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
def set_begin_index(self, begin_index: int = 0) -> None:
def set_begin_index(self, begin_index: int = 0):
"""
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -323,13 +299,13 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
Args:
sample (`torch.Tensor`):
The input sample to be scaled.
timestep (`float` or `torch.Tensor`):
The input sample.
timestep (`int`, *optional*):
The current timestep in the diffusion chain.
Returns:
`torch.Tensor`:
A scaled input sample, divided by `(sigma**2 + 1) ** 0.5`.
A scaled input sample.
"""
if self.step_index is None:
self._init_step_index(timestep)
@@ -342,18 +318,17 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
def set_timesteps(
self,
num_inference_steps: Optional[int] = None,
device: Optional[Union[str, torch.device]] = None,
num_inference_steps: int = None,
device: Union[str, torch.device] = None,
timesteps: Optional[List[int]] = None,
sigmas: Optional[List[float]] = None,
) -> None:
):
"""
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
Args:
num_inference_steps (`int`, *optional*):
The number of diffusion steps used when generating samples with a pre-trained model. If `None`,
`timesteps` or `sigmas` must be provided.
num_inference_steps (`int`):
The number of diffusion steps used when generating samples with a pre-trained model.
device (`str` or `torch.device`, *optional*):
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
timesteps (`List[int]`, *optional*):
@@ -361,9 +336,10 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps` and `sigmas`
must be `None`, and `timestep_spacing` attribute will be ignored.
sigmas (`List[float]`, *optional*):
Custom sigmas used to support arbitrary timesteps schedule. If `None`, timesteps and sigmas will be
generated based on the relevant scheduler attributes. If `sigmas` is passed, `num_inference_steps` and
`timesteps` must be `None`, and the timesteps will be generated based on the custom sigmas schedule.
Custom sigmas used to support arbitrary timesteps schedule. If `None`, timesteps and sigmas
will be generated based on the relevant scheduler attributes. If `sigmas` is passed,
`num_inference_steps` and `timesteps` must be `None`, and the timesteps will be generated based on the
custom sigmas schedule.
"""
if timesteps is not None and sigmas is not None:
@@ -473,20 +449,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
self._begin_index = None
self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication
def _sigma_to_t(self, sigma: np.ndarray, log_sigmas: np.ndarray) -> np.ndarray:
"""
Convert sigma values to corresponding timestep values through interpolation.
Args:
sigma (`np.ndarray`):
The sigma value(s) to convert to timestep(s).
log_sigmas (`np.ndarray`):
The logarithm of the sigma schedule used for interpolation.
Returns:
`np.ndarray`:
The interpolated timestep value(s) corresponding to the input sigma(s).
"""
def _sigma_to_t(self, sigma, log_sigmas):
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -510,21 +473,8 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
return t
# Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
Models](https://huggingface.co/papers/2206.00364).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following the Karras noise schedule.
"""
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""Constructs the noise schedule of Karras et al. (2022)."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -550,19 +500,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
# Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L26
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct an exponential noise schedule.
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following an exponential schedule.
"""
"""Constructs an exponential noise schedule."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -585,24 +523,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
def _convert_to_beta(
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
) -> torch.Tensor:
"""
Construct a beta noise schedule as proposed in [Beta Sampling is All You
Need](https://huggingface.co/papers/2407.12173).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
alpha (`float`, *optional*, defaults to `0.6`):
The alpha parameter for the beta distribution.
beta (`float`, *optional*, defaults to `0.6`):
The beta parameter for the beta distribution.
Returns:
`torch.Tensor`:
The converted sigma values following a beta distribution schedule.
"""
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -630,23 +551,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
)
return sigmas
def index_for_timestep(
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
) -> int:
"""
Find the index of a given timestep in the timestep schedule.
Args:
timestep (`float` or `torch.Tensor`):
The timestep value to find in the schedule.
schedule_timesteps (`torch.Tensor`, *optional*):
The timestep schedule to search in. If `None`, uses `self.timesteps`.
Returns:
`int`:
The index of the timestep in the schedule. For the very first step, returns the second index if
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
"""
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
@@ -660,14 +565,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
return indices[pos].item()
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
"""
Initialize the step index for the scheduler based on the given timestep.
Args:
timestep (`float` or `torch.Tensor`):
The current timestep to initialize the step index from.
"""
def _init_step_index(self, timestep):
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -693,33 +591,26 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
Args:
model_output (`torch.Tensor`):
The direct output from the learned diffusion model.
timestep (`float` or `torch.Tensor`):
The direct output from learned diffusion model.
timestep (`float`):
The current discrete timestep in the diffusion chain.
sample (`torch.Tensor`):
A current instance of a sample created by the diffusion process.
s_churn (`float`, *optional*, defaults to `0.0`):
Stochasticity parameter that controls the amount of noise added during sampling. Higher values increase
randomness.
s_tmin (`float`, *optional*, defaults to `0.0`):
Minimum timestep threshold for applying stochasticity. Only timesteps above this value will have noise
added.
s_tmax (`float`, *optional*, defaults to `inf`):
Maximum timestep threshold for applying stochasticity. Only timesteps below this value will have noise
added.
s_noise (`float`, *optional*, defaults to `1.0`):
s_churn (`float`):
s_tmin (`float`):
s_tmax (`float`):
s_noise (`float`, defaults to 1.0):
Scaling factor for noise added to the sample.
generator (`torch.Generator`, *optional*):
A random number generator for reproducible sampling.
return_dict (`bool`, *optional*, defaults to `True`):
A random number generator.
return_dict (`bool`):
Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
tuple.
Returns:
[`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
If `return_dict` is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
returned, otherwise a tuple is returned where the first element is the sample tensor and the second
element is the predicted original sample.
If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
returned, otherwise a tuple is returned where the first element is the sample tensor.
"""
if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
@@ -798,21 +689,6 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.Tensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise schedule at the specified timesteps.
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise tensor to add to the original samples.
timesteps (`torch.Tensor`):
The timesteps at which to add noise, determining the noise level from the schedule.
Returns:
`torch.Tensor`:
The noisy samples with added noise scaled according to the timestep schedule.
"""
# Make sure sigmas and timesteps have the same device and dtype as original_samples
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -841,24 +717,6 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
return noisy_samples
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
"""
Compute the velocity prediction for the given sample and noise at the specified timesteps.
This method implements the velocity prediction used in v-prediction models, which predicts a linear combination
of the sample and noise.
Args:
sample (`torch.Tensor`):
The input sample for which to compute the velocity.
noise (`torch.Tensor`):
The noise tensor corresponding to the sample.
timesteps (`torch.Tensor`):
The timesteps at which to compute the velocity.
Returns:
`torch.Tensor`:
The velocity prediction computed as `sqrt(alpha_prod) * noise - sqrt(1 - alpha_prod) * sample`.
"""
if (
isinstance(timesteps, int)
or isinstance(timesteps, torch.IntTensor)
@@ -895,5 +753,5 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
return velocity
def __len__(self) -> int:
def __len__(self):
return self.config.num_train_timesteps
@@ -160,7 +160,7 @@ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -473,20 +473,7 @@ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
Models](https://huggingface.co/papers/2206.00364).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following the Karras noise schedule.
"""
"""Constructs the noise schedule of Karras et al. (2022)."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -512,19 +499,7 @@ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct an exponential noise schedule.
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following an exponential schedule.
"""
"""Constructs an exponential noise schedule."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -548,24 +523,7 @@ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
def _convert_to_beta(
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
) -> torch.Tensor:
"""
Construct a beta noise schedule as proposed in [Beta Sampling is All You
Need](https://huggingface.co/papers/2407.12173).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
alpha (`float`, *optional*, defaults to `0.6`):
The alpha parameter for the beta distribution.
beta (`float`, *optional*, defaults to `0.6`):
The beta parameter for the beta distribution.
Returns:
`torch.Tensor`:
The converted sigma values following a beta distribution schedule.
"""
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -102,7 +102,7 @@ class FlowMatchHeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -168,7 +168,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -473,20 +473,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
Models](https://huggingface.co/papers/2206.00364).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following the Karras noise schedule.
"""
"""Constructs the noise schedule of Karras et al. (2022)."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -512,19 +499,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct an exponential noise schedule.
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following an exponential schedule.
"""
"""Constructs an exponential noise schedule."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -548,24 +523,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
def _convert_to_beta(
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
) -> torch.Tensor:
"""
Construct a beta noise schedule as proposed in [Beta Sampling is All You
Need](https://huggingface.co/papers/2407.12173).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
alpha (`float`, *optional*, defaults to `0.6`):
The alpha parameter for the beta distribution.
beta (`float`, *optional*, defaults to `0.6`):
The beta parameter for the beta distribution.
Returns:
`torch.Tensor`:
The converted sigma values following a beta distribution schedule.
"""
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -14,7 +14,7 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -49,10 +49,10 @@ class HeunDiscreteSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -60,17 +60,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -188,23 +187,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
def index_for_timestep(
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
) -> int:
"""
Find the index of a given timestep in the timestep schedule.
Args:
timestep (`float` or `torch.Tensor`):
The timestep value to find in the schedule.
schedule_timesteps (`torch.Tensor`, *optional*):
The timestep schedule to search in. If `None`, uses `self.timesteps`.
Returns:
`int`:
The index of the timestep in the schedule. For the very first step, returns the second index if
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
"""
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
@@ -246,7 +229,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -371,19 +354,6 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
def _sigma_to_t(self, sigma, log_sigmas):
"""
Convert sigma values to corresponding timestep values through interpolation.
Args:
sigma (`np.ndarray`):
The sigma value(s) to convert to timestep(s).
log_sigmas (`np.ndarray`):
The logarithm of the sigma schedule used for interpolation.
Returns:
`np.ndarray`:
The interpolated timestep value(s) corresponding to the input sigma(s).
"""
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -408,20 +378,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
Models](https://huggingface.co/papers/2206.00364).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following the Karras noise schedule.
"""
"""Constructs the noise schedule of Karras et al. (2022)."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -447,19 +404,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct an exponential noise schedule.
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following an exponential schedule.
"""
"""Constructs an exponential noise schedule."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -483,24 +428,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
def _convert_to_beta(
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
) -> torch.Tensor:
"""
Construct a beta noise schedule as proposed in [Beta Sampling is All You
Need](https://huggingface.co/papers/2407.12173).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
alpha (`float`, *optional*, defaults to `0.6`):
The alpha parameter for the beta distribution.
beta (`float`, *optional*, defaults to `0.6`):
The beta parameter for the beta distribution.
Returns:
`torch.Tensor`:
The converted sigma values following a beta distribution schedule.
"""
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -533,14 +461,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
return self.dt is None
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
"""
Initialize the step index for the scheduler based on the given timestep.
Args:
timestep (`float` or `torch.Tensor`):
The current timestep to initialize the step index from.
"""
def _init_step_index(self, timestep):
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -658,21 +579,6 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.Tensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise schedule at the specified timesteps.
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise tensor to add to the original samples.
timesteps (`torch.Tensor`):
The timesteps at which to add noise, determining the noise level from the schedule.
Returns:
`torch.Tensor`:
The noisy samples with added noise scaled according to the timestep schedule.
"""
# Make sure sigmas and timesteps have the same device and dtype as original_samples
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+3 -26
View File
@@ -78,7 +78,7 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -112,23 +112,7 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
self._begin_index = None
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
def index_for_timestep(
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
) -> int:
"""
Find the index of a given timestep in the timestep schedule.
Args:
timestep (`float` or `torch.Tensor`):
The timestep value to find in the schedule.
schedule_timesteps (`torch.Tensor`, *optional*):
The timestep schedule to search in. If `None`, uses `self.timesteps`.
Returns:
`int`:
The index of the timestep in the schedule. For the very first step, returns the second index if
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
"""
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
@@ -143,14 +127,7 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
return indices[pos].item()
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
"""
Initialize the step index for the scheduler based on the given timestep.
Args:
timestep (`float` or `torch.Tensor`):
The current timestep to initialize the step index from.
"""
def _init_step_index(self, timestep):
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -14,7 +14,7 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -50,10 +50,10 @@ class KDPM2AncestralDiscreteSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -61,17 +61,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -207,7 +206,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -343,19 +342,6 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
def _sigma_to_t(self, sigma, log_sigmas):
"""
Convert sigma values to corresponding timestep values through interpolation.
Args:
sigma (`np.ndarray`):
The sigma value(s) to convert to timestep(s).
log_sigmas (`np.ndarray`):
The logarithm of the sigma schedule used for interpolation.
Returns:
`np.ndarray`:
The interpolated timestep value(s) corresponding to the input sigma(s).
"""
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -380,20 +366,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
Models](https://huggingface.co/papers/2206.00364).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following the Karras noise schedule.
"""
"""Constructs the noise schedule of Karras et al. (2022)."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -419,19 +392,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct an exponential noise schedule.
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following an exponential schedule.
"""
"""Constructs an exponential noise schedule."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -455,24 +416,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
def _convert_to_beta(
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
) -> torch.Tensor:
"""
Construct a beta noise schedule as proposed in [Beta Sampling is All You
Need](https://huggingface.co/papers/2407.12173).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
alpha (`float`, *optional*, defaults to `0.6`):
The alpha parameter for the beta distribution.
beta (`float`, *optional*, defaults to `0.6`):
The beta parameter for the beta distribution.
Returns:
`torch.Tensor`:
The converted sigma values following a beta distribution schedule.
"""
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -505,23 +449,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
return self.sample is None
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
def index_for_timestep(
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
) -> int:
"""
Find the index of a given timestep in the timestep schedule.
Args:
timestep (`float` or `torch.Tensor`):
The timestep value to find in the schedule.
schedule_timesteps (`torch.Tensor`, *optional*):
The timestep schedule to search in. If `None`, uses `self.timesteps`.
Returns:
`int`:
The index of the timestep in the schedule. For the very first step, returns the second index if
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
"""
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
@@ -536,14 +464,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
return indices[pos].item()
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
"""
Initialize the step index for the scheduler based on the given timestep.
Args:
timestep (`float` or `torch.Tensor`):
The current timestep to initialize the step index from.
"""
def _init_step_index(self, timestep):
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -665,21 +586,6 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.Tensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise schedule at the specified timesteps.
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise tensor to add to the original samples.
timesteps (`torch.Tensor`):
The timesteps at which to add noise, determining the noise level from the schedule.
Returns:
`torch.Tensor`:
The noisy samples with added noise scaled according to the timestep schedule.
"""
# Make sure sigmas and timesteps have the same device and dtype as original_samples
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
@@ -14,7 +14,7 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -49,10 +49,10 @@ class KDPM2DiscreteSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -60,17 +60,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -207,7 +206,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -331,23 +330,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
return self.sample is None
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
def index_for_timestep(
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
) -> int:
"""
Find the index of a given timestep in the timestep schedule.
Args:
timestep (`float` or `torch.Tensor`):
The timestep value to find in the schedule.
schedule_timesteps (`torch.Tensor`, *optional*):
The timestep schedule to search in. If `None`, uses `self.timesteps`.
Returns:
`int`:
The index of the timestep in the schedule. For the very first step, returns the second index if
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
"""
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
@@ -362,14 +345,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
return indices[pos].item()
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
"""
Initialize the step index for the scheduler based on the given timestep.
Args:
timestep (`float` or `torch.Tensor`):
The current timestep to initialize the step index from.
"""
def _init_step_index(self, timestep):
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -379,19 +355,6 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
def _sigma_to_t(self, sigma, log_sigmas):
"""
Convert sigma values to corresponding timestep values through interpolation.
Args:
sigma (`np.ndarray`):
The sigma value(s) to convert to timestep(s).
log_sigmas (`np.ndarray`):
The logarithm of the sigma schedule used for interpolation.
Returns:
`np.ndarray`:
The interpolated timestep value(s) corresponding to the input sigma(s).
"""
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -416,20 +379,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
Models](https://huggingface.co/papers/2206.00364).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following the Karras noise schedule.
"""
"""Constructs the noise schedule of Karras et al. (2022)."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -455,19 +405,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct an exponential noise schedule.
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following an exponential schedule.
"""
"""Constructs an exponential noise schedule."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -491,24 +429,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
def _convert_to_beta(
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
) -> torch.Tensor:
"""
Construct a beta noise schedule as proposed in [Beta Sampling is All You
Need](https://huggingface.co/papers/2407.12173).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
alpha (`float`, *optional*, defaults to `0.6`):
The alpha parameter for the beta distribution.
beta (`float`, *optional*, defaults to `0.6`):
The beta parameter for the beta distribution.
Returns:
`torch.Tensor`:
The converted sigma values following a beta distribution schedule.
"""
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et al., 2024)"""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -637,21 +558,6 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.Tensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise schedule at the specified timesteps.
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise tensor to add to the original samples.
timesteps (`torch.Tensor`):
The timesteps at which to add noise, determining the noise level from the schedule.
Returns:
`torch.Tensor`:
The noisy samples with added noise scaled according to the timestep schedule.
"""
# Make sure sigmas and timesteps have the same device and dtype as original_samples
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+18 -94
View File
@@ -17,7 +17,7 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -51,10 +51,10 @@ class LCMSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -62,17 +62,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`.
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`torch.Tensor`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -100,13 +99,13 @@ def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
"""
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
Args:
betas (`torch.Tensor`):
The betas that the scheduler is being initialized with.
the betas that the scheduler is being initialized with.
Returns:
`torch.Tensor`:
Rescaled betas with zero terminal SNR.
`torch.Tensor`: rescaled betas with zero terminal SNR
"""
# Convert betas to alphas_bar_sqrt
alphas = 1.0 - betas
@@ -252,23 +251,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
self._begin_index = None
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
def index_for_timestep(
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
) -> int:
"""
Find the index of a given timestep in the timestep schedule.
Args:
timestep (`float` or `torch.Tensor`):
The timestep value to find in the schedule.
schedule_timesteps (`torch.Tensor`, *optional*):
The timestep schedule to search in. If `None`, uses `self.timesteps`.
Returns:
`int`:
The index of the timestep in the schedule. For the very first step, returns the second index if
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
"""
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
@@ -283,14 +266,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
return indices[pos].item()
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
"""
Initialize the step index for the scheduler based on the given timestep.
Args:
timestep (`float` or `torch.Tensor`):
The current timestep to initialize the step index from.
"""
def _init_step_index(self, timestep):
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -315,7 +291,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -339,8 +315,6 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -348,14 +322,6 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
@@ -631,22 +597,6 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
diffusion process).
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise to add to the samples.
timesteps (`torch.IntTensor`):
The timesteps indicating the noise level for each sample.
Returns:
`torch.Tensor`:
The noisy samples.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls
@@ -669,21 +619,6 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
"""
Compute the velocity prediction from the sample and noise according to the velocity formula.
Args:
sample (`torch.Tensor`):
The input sample.
noise (`torch.Tensor`):
The noise tensor.
timesteps (`torch.IntTensor`):
The timesteps for velocity computation.
Returns:
`torch.Tensor`:
The computed velocity.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
@@ -707,17 +642,6 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
def previous_timestep(self, timestep):
"""
Compute the previous timestep in the diffusion chain.
Args:
timestep (`int`):
The current timestep.
Returns:
`int`:
The previous timestep.
"""
if self.custom_timesteps or self.num_inference_steps:
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
if index == self.timesteps.shape[0] - 1:
@@ -14,7 +14,7 @@
import math
import warnings
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import scipy.stats
@@ -47,10 +47,10 @@ class LMSDiscreteSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -58,17 +58,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`.
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`torch.Tensor`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -210,7 +209,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -320,23 +319,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
self.derivatives = []
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
def index_for_timestep(
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
) -> int:
"""
Find the index of a given timestep in the timestep schedule.
Args:
timestep (`float` or `torch.Tensor`):
The timestep value to find in the schedule.
schedule_timesteps (`torch.Tensor`, *optional*):
The timestep schedule to search in. If `None`, uses `self.timesteps`.
Returns:
`int`:
The index of the timestep in the schedule. For the very first step, returns the second index if
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
"""
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
@@ -351,14 +334,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
return indices[pos].item()
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
"""
Initialize the step index for the scheduler based on the given timestep.
Args:
timestep (`float` or `torch.Tensor`):
The current timestep to initialize the step index from.
"""
def _init_step_index(self, timestep):
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -368,19 +344,6 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
def _sigma_to_t(self, sigma, log_sigmas):
"""
Convert sigma values to corresponding timestep values through interpolation.
Args:
sigma (`np.ndarray`):
The sigma value(s) to convert to timestep(s).
log_sigmas (`np.ndarray`):
The logarithm of the sigma schedule used for interpolation.
Returns:
`np.ndarray`:
The interpolated timestep value(s) corresponding to the input sigma(s).
"""
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -419,19 +382,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct an exponential noise schedule.
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following an exponential schedule.
"""
"""Constructs an exponential noise schedule."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -455,24 +406,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
def _convert_to_beta(
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
) -> torch.Tensor:
"""
Construct a beta noise schedule as proposed in [Beta Sampling is All You
Need](https://huggingface.co/papers/2407.12173).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
alpha (`float`, *optional*, defaults to `0.6`):
The alpha parameter for the beta distribution.
beta (`float`, *optional*, defaults to `0.6`):
The beta parameter for the beta distribution.
Returns:
`torch.Tensor`:
The converted sigma values following a beta distribution schedule.
"""
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et al., 2024)"""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -587,21 +521,6 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.Tensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise schedule at the specified timesteps.
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise tensor to add to the original samples.
timesteps (`torch.Tensor`):
The timesteps at which to add noise, determining the noise level from the schedule.
Returns:
`torch.Tensor`:
The noisy samples with added noise scaled according to the timestep schedule.
"""
# Make sure sigmas and timesteps have the same device and dtype as original_samples
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
+12 -29
View File
@@ -15,7 +15,7 @@
# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
import math
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -26,10 +26,10 @@ from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, Schedul
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -37,17 +37,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`.
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`torch.Tensor`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -453,22 +452,6 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
diffusion process).
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise to add to the samples.
timesteps (`torch.IntTensor`):
The timesteps indicating the noise level for each sample.
Returns:
`torch.Tensor`:
The noisy samples.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls
+12 -13
View File
@@ -14,7 +14,7 @@
import math
from dataclasses import dataclass
from typing import Literal, Optional, Tuple, Union
from typing import Optional, Tuple, Union
import numpy as np
import torch
@@ -45,10 +45,10 @@ class RePaintSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -56,17 +56,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`.
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`torch.Tensor`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
+16 -98
View File
@@ -16,7 +16,7 @@
# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
import math
from typing import Callable, List, Literal, Optional, Tuple, Union
from typing import Callable, List, Optional, Tuple, Union
import numpy as np
import torch
@@ -33,10 +33,10 @@ if is_scipy_available():
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -44,17 +44,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`.
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`torch.Tensor`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -254,7 +253,7 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -343,8 +342,6 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -352,14 +349,6 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
@@ -386,19 +375,6 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
def _sigma_to_t(self, sigma, log_sigmas):
"""
Convert sigma values to corresponding timestep values through interpolation.
Args:
sigma (`np.ndarray`):
The sigma value(s) to convert to timestep(s).
log_sigmas (`np.ndarray`):
The logarithm of the sigma schedule used for interpolation.
Returns:
`np.ndarray`:
The interpolated timestep value(s) corresponding to the input sigma(s).
"""
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -434,20 +410,7 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
Models](https://huggingface.co/papers/2206.00364).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following the Karras noise schedule.
"""
"""Constructs the noise schedule of Karras et al. (2022)."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -473,19 +436,7 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct an exponential noise schedule.
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following an exponential schedule.
"""
"""Constructs an exponential noise schedule."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -509,24 +460,7 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
def _convert_to_beta(
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
) -> torch.Tensor:
"""
Construct a beta noise schedule as proposed in [Beta Sampling is All You
Need](https://huggingface.co/papers/2407.12173).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
alpha (`float`, *optional*, defaults to `0.6`):
The alpha parameter for the beta distribution.
beta (`float`, *optional*, defaults to `0.6`):
The beta parameter for the beta distribution.
Returns:
`torch.Tensor`:
The converted sigma values following a beta distribution schedule.
"""
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et al., 2024)"""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -1259,22 +1193,6 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
diffusion process).
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise to add to the samples.
timesteps (`torch.IntTensor`):
The timesteps indicating the noise level for each sample.
Returns:
`torch.Tensor`:
The noisy samples.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls
+3 -26
View File
@@ -109,7 +109,7 @@ class SCMScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -173,14 +173,7 @@ class SCMScheduler(SchedulerMixin, ConfigMixin):
self._begin_index = None
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
"""
Initialize the step index for the scheduler based on the given timestep.
Args:
timestep (`float` or `torch.Tensor`):
The current timestep to initialize the step index from.
"""
def _init_step_index(self, timestep):
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -189,23 +182,7 @@ class SCMScheduler(SchedulerMixin, ConfigMixin):
self._step_index = self._begin_index
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
def index_for_timestep(
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
) -> int:
"""
Find the index of a given timestep in the timestep schedule.
Args:
timestep (`float` or `torch.Tensor`):
The timestep value to find in the schedule.
schedule_timesteps (`torch.Tensor`, *optional*):
The timestep schedule to search in. If `None`, uses `self.timesteps`.
Returns:
`int`:
The index of the timestep in the schedule. For the very first step, returns the second index if
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
"""
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
+18 -112
View File
@@ -17,7 +17,7 @@
import math
from dataclasses import dataclass
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -50,10 +50,10 @@ class TCDSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -61,17 +61,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`.
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`torch.Tensor`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -99,13 +98,13 @@ def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
"""
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
Args:
betas (`torch.Tensor`):
The betas that the scheduler is being initialized with.
the betas that the scheduler is being initialized with.
Returns:
`torch.Tensor`:
Rescaled betas with zero terminal SNR.
`torch.Tensor`: rescaled betas with zero terminal SNR
"""
# Convert betas to alphas_bar_sqrt
alphas = 1.0 - betas
@@ -253,23 +252,7 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
self._begin_index = None
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
def index_for_timestep(
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
) -> int:
"""
Find the index of a given timestep in the timestep schedule.
Args:
timestep (`float` or `torch.Tensor`):
The timestep value to find in the schedule.
schedule_timesteps (`torch.Tensor`, *optional*):
The timestep schedule to search in. If `None`, uses `self.timesteps`.
Returns:
`int`:
The index of the timestep in the schedule. For the very first step, returns the second index if
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
"""
def index_for_timestep(self, timestep, schedule_timesteps=None):
if schedule_timesteps is None:
schedule_timesteps = self.timesteps
@@ -284,14 +267,7 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
return indices[pos].item()
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
"""
Initialize the step index for the scheduler based on the given timestep.
Args:
timestep (`float` or `torch.Tensor`):
The current timestep to initialize the step index from.
"""
def _init_step_index(self, timestep):
if self.begin_index is None:
if isinstance(timestep, torch.Tensor):
timestep = timestep.to(self.timesteps.device)
@@ -316,7 +292,7 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -340,24 +316,6 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler._get_variance
def _get_variance(self, timestep, prev_timestep):
"""
Computes the variance of the noise added at a given diffusion step.
For a given `timestep` and its previous step, this method calculates the variance as defined in DDIM/DDPM
literature:
var_t = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
where alpha_prod and beta_prod are cumulative products of alphas and betas, respectively.
Args:
timestep (`int`):
The current timestep in the diffusion process.
prev_timestep (`int`):
The previous timestep in the diffusion process. If negative, uses `final_alpha_cumprod`.
Returns:
`torch.Tensor`:
The variance for the current timestep.
"""
alpha_prod_t = self.alphas_cumprod[timestep]
alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
beta_prod_t = 1 - alpha_prod_t
@@ -370,8 +328,6 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -379,14 +335,6 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
@@ -686,22 +634,6 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
diffusion process).
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise to add to the samples.
timesteps (`torch.IntTensor`):
The timesteps indicating the noise level for each sample.
Returns:
`torch.Tensor`:
The noisy samples.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls
@@ -724,21 +656,6 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
"""
Compute the velocity prediction from the sample and noise according to the velocity formula.
Args:
sample (`torch.Tensor`):
The input sample.
noise (`torch.Tensor`):
The noise tensor.
timesteps (`torch.IntTensor`):
The timesteps for velocity computation.
Returns:
`torch.Tensor`:
The computed velocity.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as sample
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
@@ -762,17 +679,6 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
def previous_timestep(self, timestep):
"""
Compute the previous timestep in the diffusion chain.
Args:
timestep (`int`):
The current timestep.
Returns:
`int`:
The previous timestep.
"""
if self.custom_timesteps or self.num_inference_steps:
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
if index == self.timesteps.shape[0] - 1:
+12 -29
View File
@@ -14,7 +14,7 @@
import math
from dataclasses import dataclass
from typing import Literal, Optional, Tuple, Union
from typing import Optional, Tuple, Union
import numpy as np
import torch
@@ -46,10 +46,10 @@ class UnCLIPSchedulerOutput(BaseOutput):
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -57,17 +57,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -335,22 +334,6 @@ class UnCLIPScheduler(SchedulerMixin, ConfigMixin):
noise: torch.Tensor,
timesteps: torch.IntTensor,
) -> torch.Tensor:
"""
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
diffusion process).
Args:
original_samples (`torch.Tensor`):
The original samples to which noise will be added.
noise (`torch.Tensor`):
The noise to add to the samples.
timesteps (`torch.IntTensor`):
The timesteps indicating the noise level for each sample.
Returns:
`torch.Tensor`:
The noisy samples.
"""
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
# for the subsequent add_noise calls
@@ -16,7 +16,7 @@
# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
import math
from typing import List, Literal, Optional, Tuple, Union
from typing import List, Optional, Tuple, Union
import numpy as np
import torch
@@ -32,10 +32,10 @@ if is_scipy_available():
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
def betas_for_alpha_bar(
num_diffusion_timesteps: int,
max_beta: float = 0.999,
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
) -> torch.Tensor:
num_diffusion_timesteps,
max_beta=0.999,
alpha_transform_type="cosine",
):
"""
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
(1-beta) over time from t = [0,1].
@@ -43,17 +43,16 @@ def betas_for_alpha_bar(
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
to that part of the diffusion process.
Args:
num_diffusion_timesteps (`int`):
The number of betas to produce.
max_beta (`float`, defaults to `0.999`):
The maximum beta to use; use values lower than 1 to avoid numerical instability.
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
num_diffusion_timesteps (`int`): the number of betas to produce.
max_beta (`float`): the maximum beta to use; use values lower than 1 to
prevent singularities.
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
Choose from `cosine` or `exp`
Returns:
`torch.Tensor`:
The betas used by the scheduler to step the model outputs.
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
"""
if alpha_transform_type == "cosine":
@@ -81,13 +80,13 @@ def rescale_zero_terminal_snr(betas):
"""
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
Args:
betas (`torch.Tensor`):
The betas that the scheduler is being initialized with.
the betas that the scheduler is being initialized with.
Returns:
`torch.Tensor`:
Rescaled betas with zero terminal SNR.
`torch.Tensor`: rescaled betas with zero terminal SNR
"""
# Convert betas to alphas_bar_sqrt
alphas = 1.0 - betas
@@ -298,7 +297,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
Args:
begin_index (`int`, defaults to `0`):
begin_index (`int`):
The begin index for the scheduler.
"""
self._begin_index = begin_index
@@ -433,8 +432,6 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
"""
Apply dynamic thresholding to the predicted sample.
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
@@ -442,14 +439,6 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
photorealism as well as better image-text alignment, especially when using very large guidance weights."
https://huggingface.co/papers/2205.11487
Args:
sample (`torch.Tensor`):
The predicted sample to be thresholded.
Returns:
`torch.Tensor`:
The thresholded sample.
"""
dtype = sample.dtype
batch_size, channels, *remaining_dims = sample.shape
@@ -476,19 +465,6 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
def _sigma_to_t(self, sigma, log_sigmas):
"""
Convert sigma values to corresponding timestep values through interpolation.
Args:
sigma (`np.ndarray`):
The sigma value(s) to convert to timestep(s).
log_sigmas (`np.ndarray`):
The logarithm of the sigma schedule used for interpolation.
Returns:
`np.ndarray`:
The interpolated timestep value(s) corresponding to the input sigma(s).
"""
# get log sigma
log_sigma = np.log(np.maximum(sigma, 1e-10))
@@ -524,20 +500,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
"""
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
Models](https://huggingface.co/papers/2206.00364).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following the Karras noise schedule.
"""
"""Constructs the noise schedule of Karras et al. (2022)."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -563,19 +526,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
"""
Construct an exponential noise schedule.
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
Returns:
`torch.Tensor`:
The converted sigma values following an exponential schedule.
"""
"""Constructs an exponential noise schedule."""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
@@ -599,24 +550,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
def _convert_to_beta(
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
) -> torch.Tensor:
"""
Construct a beta noise schedule as proposed in [Beta Sampling is All You
Need](https://huggingface.co/papers/2407.12173).
Args:
in_sigmas (`torch.Tensor`):
The input sigma values to be converted.
num_inference_steps (`int`):
The number of inference steps to generate the noise schedule for.
alpha (`float`, *optional*, defaults to `0.6`):
The alpha parameter for the beta distribution.
beta (`float`, *optional*, defaults to `0.6`):
The beta parameter for the beta distribution.
Returns:
`torch.Tensor`:
The converted sigma values following a beta distribution schedule.
"""
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
# Hack to make sure that other schedulers which copy this function don't break
# TODO: Add this logic to the other schedulers
-1
View File
@@ -108,7 +108,6 @@ from .import_utils import (
is_tensorboard_available,
is_timm_available,
is_torch_available,
is_torch_mlu_available,
is_torch_npu_available,
is_torch_version,
is_torch_xla_available,
+1 -1
View File
@@ -42,7 +42,7 @@ HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(HF_HOME, "modules"
DEPRECATED_REVISION_ARGS = ["fp16", "non-ema"]
DIFFUSERS_REQUEST_TIMEOUT = 60
DIFFUSERS_ATTN_BACKEND = os.getenv("DIFFUSERS_ATTN_BACKEND", "native")
DIFFUSERS_ATTN_CHECKS = os.getenv("DIFFUSERS_ATTN_CHECKS", "0").upper() in ENV_VARS_TRUE_VALUES
DIFFUSERS_ATTN_CHECKS = os.getenv("DIFFUSERS_ATTN_CHECKS", "0") in ENV_VARS_TRUE_VALUES
DEFAULT_HF_PARALLEL_LOADING_WORKERS = 8
HF_ENABLE_PARALLEL_LOADING = os.environ.get("HF_ENABLE_PARALLEL_LOADING", "").upper() in ENV_VARS_TRUE_VALUES
DIFFUSERS_DISABLE_REMOTE_CODE = os.getenv("DIFFUSERS_DISABLE_REMOTE_CODE", "false").upper() in ENV_VARS_TRUE_VALUES
-15
View File
@@ -1623,21 +1623,6 @@ class VQModel(metaclass=DummyObject):
requires_backends(cls, ["torch"])
class WanAnimateTransformer3DModel(metaclass=DummyObject):
_backends = ["torch"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch"])
class WanTransformer3DModel(metaclass=DummyObject):
_backends = ["torch"]
@@ -3512,21 +3512,6 @@ class VQDiffusionPipeline(metaclass=DummyObject):
requires_backends(cls, ["torch", "transformers"])
class WanAnimatePipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
def __init__(self, *args, **kwargs):
requires_backends(self, ["torch", "transformers"])
@classmethod
def from_config(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
@classmethod
def from_pretrained(cls, *args, **kwargs):
requires_backends(cls, ["torch", "transformers"])
class WanImageToVideoPipeline(metaclass=DummyObject):
_backends = ["torch", "transformers"]
-5
View File
@@ -192,7 +192,6 @@ except importlib_metadata.PackageNotFoundError:
_torch_xla_available, _torch_xla_version = _is_package_available("torch_xla")
_torch_npu_available, _torch_npu_version = _is_package_available("torch_npu")
_torch_mlu_available, _torch_mlu_version = _is_package_available("torch_mlu")
_transformers_available, _transformers_version = _is_package_available("transformers")
_hf_hub_available, _hf_hub_version = _is_package_available("huggingface_hub")
_kernels_available, _kernels_version = _is_package_available("kernels")
@@ -244,10 +243,6 @@ def is_torch_npu_available():
return _torch_npu_available
def is_torch_mlu_available():
return _torch_mlu_available
def is_flax_available():
return _flax_available
+3 -5
View File
@@ -20,7 +20,7 @@ import os
from typing import Callable, Dict, List, Optional, Tuple, Union
from . import logging
from .import_utils import is_torch_available, is_torch_mlu_available, is_torch_npu_available, is_torch_version
from .import_utils import is_torch_available, is_torch_npu_available, is_torch_version
if is_torch_available():
@@ -242,8 +242,8 @@ def fourier_filter(x_in: "torch.Tensor", threshold: int, scale: int) -> "torch.T
def apply_freeu(
resolution_idx: int, hidden_states: "torch.Tensor", res_hidden_states: "torch.Tensor", **freeu_kwargs
) -> Tuple["torch.Tensor", "torch.Tensor"]:
"""Applies the FreeU mechanism as introduced in https://huggingface.co/papers/2309.11497. Adapted from the official
code repository: https://github.com/ChenyangSi/FreeU.
"""Applies the FreeU mechanism as introduced in https:
//arxiv.org/abs/2309.11497. Adapted from the official code repository: https://github.com/ChenyangSi/FreeU.
Args:
resolution_idx (`int`): Integer denoting the UNet block where FreeU is being applied.
@@ -286,8 +286,6 @@ def get_device():
return "xpu"
elif torch.backends.mps.is_available():
return "mps"
elif is_torch_mlu_available():
return "mlu"
else:
return "cpu"
+14
View File
@@ -32,6 +32,20 @@ warnings.simplefilter(action="ignore", category=FutureWarning)
def pytest_configure(config):
    """
    Register the custom pytest markers used by the test suite.

    Each entry is added to the `markers` ini option in the order listed, using the
    `name: description` format pytest expects for marker declarations.

    Args:
        config: The pytest config object; only its `addinivalue_line` method is used.
    """
    marker_lines = (
        "big_accelerator: marks tests as requiring big accelerator resources",
        "lora: marks tests for LoRA/PEFT functionality",
        "ip_adapter: marks tests for IP Adapter functionality",
        "training: marks tests for training functionality",
        "attention: marks tests for attention processor functionality",
        "memory: marks tests for memory optimization functionality",
        "cpu_offload: marks tests for CPU offloading functionality",
        "group_offload: marks tests for group offloading functionality",
        "compile: marks tests for torch.compile functionality",
        "single_file: marks tests for single file checkpoint loading",
        "bitsandbytes: marks tests for BitsAndBytes quantization functionality",
        "quanto: marks tests for Quanto quantization functionality",
        "torchao: marks tests for TorchAO quantization functionality",
        "gguf: marks tests for GGUF quantization functionality",
        "modelopt: marks tests for NVIDIA ModelOpt quantization functionality",
    )
    for marker_line in marker_lines:
        config.addinivalue_line("markers", marker_line)
def pytest_addoption(parser):
@@ -82,7 +82,3 @@ class AutoencoderDCTests(ModelTesterMixin, AutoencoderTesterMixin, unittest.Test
@unittest.skipIf(IS_GITHUB_ACTIONS, reason="Skipping test inside GitHub Actions environment")
def test_layerwise_casting_inference(self):
super().test_layerwise_casting_inference()
@unittest.skipIf(IS_GITHUB_ACTIONS, reason="Skipping test inside GitHub Actions environment")
def test_layerwise_casting_memory(self):
super().test_layerwise_casting_memory()
+12 -12
View File
@@ -317,9 +317,9 @@ class ModelUtilsTest(unittest.TestCase):
repo_id, subfolder="transformer", cache_dir=tmpdir, local_files_only=True
)
assert all(torch.equal(p1, p2) for p1, p2 in zip(model.parameters(), local_model.parameters())), (
"Model parameters don't match!"
)
assert all(
torch.equal(p1, p2) for p1, p2 in zip(model.parameters(), local_model.parameters())
), "Model parameters don't match!"
# Remove a shard file
cached_shard_file = try_to_load_from_cache(
@@ -335,9 +335,9 @@ class ModelUtilsTest(unittest.TestCase):
# Verify error mentions the missing shard
error_msg = str(context.exception)
assert cached_shard_file in error_msg or "required according to the checkpoint index" in error_msg, (
f"Expected error about missing shard, got: {error_msg}"
)
assert (
cached_shard_file in error_msg or "required according to the checkpoint index" in error_msg
), f"Expected error about missing shard, got: {error_msg}"
@unittest.skip("Flaky behaviour on CI. Re-enable after migrating to new runners")
@unittest.skipIf(torch_device == "mps", reason="Test not supported for MPS.")
@@ -354,9 +354,9 @@ class ModelUtilsTest(unittest.TestCase):
)
download_requests = [r.method for r in m.request_history]
assert download_requests.count("HEAD") == 3, (
"3 HEAD requests one for config, one for model, and one for shard index file."
)
assert (
download_requests.count("HEAD") == 3
), "3 HEAD requests one for config, one for model, and one for shard index file."
assert download_requests.count("GET") == 2, "2 GET requests one for config, one for model"
with requests_mock.mock(real_http=True) as m:
@@ -368,9 +368,9 @@ class ModelUtilsTest(unittest.TestCase):
)
cache_requests = [r.method for r in m.request_history]
assert "HEAD" == cache_requests[0] and len(cache_requests) == 2, (
"We should call only `model_info` to check for commit hash and knowing if shard index is present."
)
assert (
"HEAD" == cache_requests[0] and len(cache_requests) == 2
), "We should call only `model_info` to check for commit hash and knowing if shard index is present."
def test_weight_overwrite(self):
with tempfile.TemporaryDirectory() as tmpdirname, self.assertRaises(ValueError) as error_context:
+37
View File
@@ -0,0 +1,37 @@
from .attention import AttentionTesterMixin
from .common import ModelTesterMixin
from .compile import TorchCompileTesterMixin
from .ip_adapter import IPAdapterTesterMixin
from .lora import LoraTesterMixin
from .memory import CPUOffloadTesterMixin, GroupOffloadTesterMixin, LayerwiseCastingTesterMixin, MemoryTesterMixin
from .quantization import (
BitsAndBytesTesterMixin,
GGUFTesterMixin,
ModelOptTesterMixin,
QuantizationTesterMixin,
QuantoTesterMixin,
TorchAoTesterMixin,
)
from .single_file import SingleFileTesterMixin
from .training import TrainingTesterMixin
# Names exposed by `from <this package> import *`. Every entry corresponds to a tester
# mixin imported above; the list is kept in alphabetical order.
__all__ = [
    "AttentionTesterMixin",
    "BitsAndBytesTesterMixin",
    "CPUOffloadTesterMixin",
    "GGUFTesterMixin",
    "GroupOffloadTesterMixin",
    "IPAdapterTesterMixin",
    "LayerwiseCastingTesterMixin",
    "LoraTesterMixin",
    "MemoryTesterMixin",
    "ModelOptTesterMixin",
    "ModelTesterMixin",
    "QuantizationTesterMixin",
    "QuantoTesterMixin",
    "SingleFileTesterMixin",
    "TorchAoTesterMixin",
    "TorchCompileTesterMixin",
    "TrainingTesterMixin",
]
+180
View File
@@ -0,0 +1,180 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest
import torch
from diffusers.models.attention import AttentionModuleMixin
from diffusers.models.attention_processor import (
AttnProcessor,
)
from ...testing_utils import is_attention, require_accelerator, torch_device
@is_attention
@require_accelerator
class AttentionTesterMixin:
    """
    Mixin class for testing attention processor and module functionality on models.

    Tests functionality from AttentionModuleMixin including:
        - Attention processor management (set/get)
        - QKV projection fusion/unfusion

    Expected class attributes to be set by subclasses:
        - model_class: The model class to test
        - base_precision: Tolerance for floating point comparisons (default: 1e-3)

    Expected methods to be implemented by subclasses:
        - get_init_dict(): Returns dict of arguments to initialize the model
        - get_dummy_inputs(): Returns dict of inputs to pass to the model forward pass

    Pytest mark: attention
        Use `pytest -m "not attention"` to skip these tests

    NOTE(review): an earlier revision of this docstring also listed a `uses_custom_attn_processor` attribute and
    attention-backend coverage (XFormers, NPU, ...). No test in this class reads that attribute or switches
    backends, so they are not documented as part of the contract here.
    """

    base_precision = 1e-3

    @staticmethod
    def _forward_first_output(model, inputs_dict):
        """
        Run `model(**inputs_dict)` without gradient tracking and return the result.

        Dict-like outputs are unwrapped to their first element via `to_tuple()[0]`; any other output type is
        returned unchanged.
        """
        with torch.no_grad():
            output = model(**inputs_dict)
        if isinstance(output, dict):
            output = output.to_tuple()[0]
        return output

    def test_fuse_unfuse_qkv_projections(self):
        """Fusing and then unfusing the QKV projections must leave the model output unchanged."""
        init_dict = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()
        model = self.model_class(**init_dict)
        model.to(torch_device)
        model.eval()

        if not hasattr(model, "fuse_qkv_projections"):
            pytest.skip("Model does not support QKV projection fusion.")

        # Get output before fusion
        output_before_fusion = self._forward_first_output(model, inputs_dict)

        # Fuse projections
        model.fuse_qkv_projections()

        # Verify fusion occurred by checking for fused attributes. Only the first attention module that exposes a
        # fused projection is inspected; the loop stops there.
        has_fused_projections = False
        for module in model.modules():
            if isinstance(module, AttentionModuleMixin):
                if hasattr(module, "to_qkv") or hasattr(module, "to_kv"):
                    has_fused_projections = True
                    assert module.fused_projections, "fused_projections flag should be True"
                    break

        if has_fused_projections:
            # Get output after fusion
            output_after_fusion = self._forward_first_output(model, inputs_dict)

            # Verify outputs match
            assert torch.allclose(
                output_before_fusion, output_after_fusion, atol=self.base_precision
            ), "Output should not change after fusing projections"

            # Unfuse projections
            model.unfuse_qkv_projections()

            # Verify unfusion occurred
            for module in model.modules():
                if isinstance(module, AttentionModuleMixin):
                    assert not hasattr(module, "to_qkv"), "to_qkv should be removed after unfusing"
                    assert not hasattr(module, "to_kv"), "to_kv should be removed after unfusing"
                    assert not module.fused_projections, "fused_projections flag should be False"

            # Get output after unfusion
            output_after_unfusion = self._forward_first_output(model, inputs_dict)

            # Verify outputs still match
            assert torch.allclose(
                output_before_fusion, output_after_unfusion, atol=self.base_precision
            ), "Output should match original after unfusing projections"

    def test_get_set_processor(self):
        """Every attention module must return its processor and accept a replacement one."""
        init_dict = self.get_init_dict()
        model = self.model_class(**init_dict)
        model.to(torch_device)

        # Check if model has attention processors
        if not hasattr(model, "attn_processors"):
            pytest.skip("Model does not have attention processors.")

        # Test getting processors
        processors = model.attn_processors
        assert isinstance(processors, dict), "attn_processors should return a dict"
        assert len(processors) > 0, "Model should have at least one attention processor"

        # Test that all processors can be retrieved via get_processor
        for module in model.modules():
            if isinstance(module, AttentionModuleMixin):
                processor = module.get_processor()
                assert processor is not None, "get_processor should return a processor"

                # Test setting a new processor
                new_processor = AttnProcessor()
                module.set_processor(new_processor)
                retrieved_processor = module.get_processor()
                assert retrieved_processor is new_processor, "Retrieved processor should be the same as the one set"

    def test_attention_processor_dict(self):
        """Setting processors through a full name -> processor dict must replace every processor."""
        init_dict = self.get_init_dict()
        model = self.model_class(**init_dict)
        model.to(torch_device)

        if not hasattr(model, "set_attn_processor"):
            pytest.skip("Model does not support setting attention processors.")

        # Get current processors
        current_processors = model.attn_processors

        # Create a dict of new processors
        new_processors = {key: AttnProcessor() for key in current_processors.keys()}

        # Set processors using dict
        model.set_attn_processor(new_processors)

        # Verify all processors were set
        updated_processors = model.attn_processors
        for key in current_processors.keys():
            # Exact type check on purpose (subclasses do not count); `is` is the idiomatic way to compare types.
            assert type(updated_processors[key]) is AttnProcessor, f"Processor {key} should be AttnProcessor"

    def test_attention_processor_count_mismatch_raises_error(self):
        """Passing a processor dict with too few entries must raise a `ValueError`."""
        init_dict = self.get_init_dict()
        model = self.model_class(**init_dict)
        model.to(torch_device)

        if not hasattr(model, "set_attn_processor"):
            pytest.skip("Model does not support setting attention processors.")

        # Get current processors
        current_processors = model.attn_processors

        # A one-entry dict is only "wrong" when the model has at least two processors. With a single processor
        # the dict below would have the correct size and no error would be raised; with none, there is no key
        # to pick at all.
        if len(current_processors) < 2:
            pytest.skip("Model needs at least two attention processors to build a mismatched processor dict.")

        # Create a dict with wrong number of processors
        wrong_processors = {list(current_processors.keys())[0]: AttnProcessor()}

        # Verify error is raised
        with pytest.raises(ValueError) as exc_info:
            model.set_attn_processor(wrong_processors)

        assert "number of processors" in str(exc_info.value).lower(), "Error should mention processor count mismatch"
+514
View File
@@ -0,0 +1,514 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import tempfile
from collections import defaultdict
from typing import Dict, List, Optional, Tuple, Union

import pytest
import torch
import torch.nn as nn
from accelerate.utils.modeling import _get_proper_dtype, compute_module_sizes, dtype_byte_size

from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME, _add_variant
from diffusers.utils.testing_utils import require_accelerator, require_torch_multi_accelerator

from ...testing_utils import torch_device
def named_persistent_module_tensors(module: nn.Module, recurse: bool = False):
    """
    Yield `(name, tensor)` for every parameter and every persistent buffer of `module`.

    Buffers registered with `persistent=False` are skipped.

    Args:
        module: The module to inspect.
        recurse: Whether to also visit the tensors of submodules.
    """
    yield from module.named_parameters(recurse=recurse)

    for name, buffer in module.named_buffers(recurse=recurse):
        # A buffer's persistence is recorded on the module that owns it, not on the root module.
        owner_path, _, local_name = name.rpartition(".")
        owner = module.get_submodule(owner_path) if owner_path else module
        if local_name not in owner._non_persistent_buffers_set:
            yield name, buffer


def compute_module_persistent_sizes(
    model: nn.Module,
    dtype: Optional[Union[str, torch.dtype]] = None,
    special_dtypes: Optional[Dict[str, Union[str, torch.dtype]]] = None,
):
    """
    Compute the size of each submodule of a given model (parameters + persistent buffers).

    Args:
        model: The model to measure.
        dtype: Optional dtype the tensors are assumed to be stored in. When given, each tensor is
            counted with the smaller of this dtype's byte size and its own byte size; integer and
            bool tensors always keep their own size.
        special_dtypes: Optional mapping from full tensor name to a dtype overriding `dtype` for
            that tensor.

    Returns:
        A `defaultdict(int)` mapping every dotted module prefix to its accumulated size in bytes.
        The empty string `""` is the key for the whole model.
    """
    if dtype is not None:
        dtype = _get_proper_dtype(dtype)
        dtype_size = dtype_byte_size(dtype)
    if special_dtypes is not None:
        special_dtypes = {key: _get_proper_dtype(dtyp) for key, dtyp in special_dtypes.items()}
        special_dtypes_size = {key: dtype_byte_size(dtyp) for key, dtyp in special_dtypes.items()}

    module_sizes = defaultdict(int)

    for name, tensor in named_persistent_module_tensors(model, recurse=True):
        if special_dtypes is not None and name in special_dtypes:
            size = tensor.numel() * special_dtypes_size[name]
        elif dtype is None:
            size = tensor.numel() * dtype_byte_size(tensor.dtype)
        elif str(tensor.dtype).startswith(("torch.uint", "torch.int", "torch.bool")):
            # According to the code in set_module_tensor_to_device, these types won't be converted
            # so use their original size here
            size = tensor.numel() * dtype_byte_size(tensor.dtype)
        else:
            size = tensor.numel() * min(dtype_size, dtype_byte_size(tensor.dtype))

        # Credit the tensor to every enclosing module: "", "a", "a.b", ..., and its own full name.
        name_parts = name.split(".")
        for idx in range(len(name_parts) + 1):
            module_sizes[".".join(name_parts[:idx])] += size

    return module_sizes
def calculate_expected_num_shards(index_map_path):
    """
    Read a sharded checkpoint index and return the total number of shards it announces.

    The total is parsed from the shard filename of the first `weight_map` entry, i.e. the
    `YYYYY` part of `<name>-XXXXX-of-YYYYY.<ext>`.

    Args:
        index_map_path: Path to the sharded checkpoint index file

    Returns:
        int: Expected number of shards
    """
    with open(index_map_path) as index_file:
        shard_by_weight = json.load(index_file)["weight_map"]

    # e.g., diffusion_pytorch_model-00001-of-00002.safetensors
    first_shard_file = list(shard_by_weight.values())[0]
    total_with_extension = first_shard_file.rsplit("-", 1)[-1]
    return int(total_with_extension.split(".")[0])
def check_device_map_is_respected(model, device_map):
    """
    Assert that every parameter of `model` lives on the device that `device_map` assigns to it.

    For each parameter the closest entry in `device_map` is used: the parameter's own name, else
    its parent module, and so on up to the root key `""`. Entries mapped to `"cpu"` or `"disk"`
    are expected to be on the `meta` device; all other entries must be on the mapped device.

    Args:
        model: Module whose `named_parameters()` are checked.
        device_map: Mapping from module/parameter name prefix to a device.

    Raises:
        ValueError: If neither a parameter nor any of its ancestors appears in `device_map`.
        AssertionError: If a parameter is not on the expected device.
    """
    for param_name, param in model.named_parameters():
        # Walk up the dotted path on a separate variable so the full parameter name stays
        # available for error messages.
        map_key = param_name
        while len(map_key) > 0 and map_key not in device_map:
            map_key = ".".join(map_key.split(".")[:-1])

        if map_key not in device_map:
            raise ValueError(f"device map is incomplete, it does not contain any device for `{param_name}`.")

        param_device = device_map[map_key]
        if param_device in ["cpu", "disk"]:
            assert param.device == torch.device("meta"), f"Expected device 'meta' for {param_name}, got {param.device}"
        else:
            assert param.device == torch.device(
                param_device
            ), f"Expected device {param_device} for {param_name}, got {param.device}"
class ModelTesterMixin:
    """
    Base mixin class for model testing with common test methods.

    Expected class attributes to be set by subclasses:
        - model_class: The model class to test
        - main_input_name: Name of the main input tensor (e.g., "sample", "hidden_states")
        - base_precision: Default tolerance for floating point comparisons (default: 1e-3)

    Expected methods to be implemented by subclasses:
        - get_init_dict(): Returns dict of arguments to initialize the model
        - get_dummy_inputs(): Returns dict of inputs to pass to the model forward pass
    """

    model_class = None
    base_precision = 1e-3
    # Fractions of the model size granted to device 0 in `test_model_parallelism`.
    model_split_percents = [0.5, 0.7]

    def get_init_dict(self):
        raise NotImplementedError("get_init_dict must be implemented by subclasses. ")

    def get_dummy_inputs(self):
        raise NotImplementedError(
            "get_dummy_inputs must be implemented by subclasses. " "It should return inputs_dict."
        )

    @staticmethod
    def _first_output(output):
        """Return the first element of a dict-like model output; pass any other value through."""
        if isinstance(output, dict):
            return output.to_tuple()[0]
        return output

    @staticmethod
    def _assert_shard_count_matches_index(checkpoint_dir, index_path):
        """Check that the number of `.safetensors` files equals the count announced by the index."""
        expected_num_shards = calculate_expected_num_shards(index_path)
        actual_num_shards = len([file for file in os.listdir(checkpoint_dir) if file.endswith(".safetensors")])
        assert (
            actual_num_shards == expected_num_shards
        ), f"Expected {expected_num_shards} shards, got {actual_num_shards}"

    def test_from_save_pretrained(self, expected_max_diff=5e-5):
        """A model reloaded via `save_pretrained`/`from_pretrained` keeps shapes and outputs."""
        model = self.model_class(**self.get_init_dict())
        model.to(torch_device)
        model.eval()

        with tempfile.TemporaryDirectory() as tmpdirname:
            model.save_pretrained(tmpdirname)
            new_model = self.model_class.from_pretrained(tmpdirname)
            new_model.to(torch_device)

        # check if all parameters shape are the same
        original_state = model.state_dict()
        reloaded_state = new_model.state_dict()
        for param_name in original_state.keys():
            param_1 = original_state[param_name]
            param_2 = reloaded_state[param_name]
            assert (
                param_1.shape == param_2.shape
            ), f"Parameter shape mismatch for {param_name}. Original: {param_1.shape}, loaded: {param_2.shape}"

        with torch.no_grad():
            image = self._first_output(model(**self.get_dummy_inputs()))
            new_image = self._first_output(new_model(**self.get_dummy_inputs()))

        max_diff = (image - new_image).abs().max().item()
        assert (
            max_diff <= expected_max_diff
        ), f"Models give different forward passes. Max diff: {max_diff}, expected: {expected_max_diff}"

    def test_from_save_pretrained_variant(self, expected_max_diff=5e-5):
        """A checkpoint saved with `variant="fp16"` loads only when the same variant is requested."""
        model = self.model_class(**self.get_init_dict())
        model.to(torch_device)
        model.eval()

        with tempfile.TemporaryDirectory() as tmpdirname:
            model.save_pretrained(tmpdirname, variant="fp16")
            new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16")

            # non-variant cannot be loaded
            with pytest.raises(OSError) as exc_info:
                self.model_class.from_pretrained(tmpdirname)

            # make sure that error message states what keys are missing
            assert "Error no file named diffusion_pytorch_model.bin found in directory" in str(exc_info.value)

        new_model.to(torch_device)

        with torch.no_grad():
            image = self._first_output(model(**self.get_dummy_inputs()))
            new_image = self._first_output(new_model(**self.get_dummy_inputs()))

        max_diff = (image - new_image).abs().max().item()
        assert (
            max_diff <= expected_max_diff
        ), f"Models give different forward passes. Max diff: {max_diff}, expected: {expected_max_diff}"

    def test_from_save_pretrained_dtype(self):
        """`from_pretrained(..., torch_dtype=dtype)` yields a model of that dtype."""
        model = self.model_class(**self.get_init_dict())
        model.to(torch_device)
        model.eval()

        for dtype in [torch.float32, torch.float16, torch.bfloat16]:
            if torch_device == "mps" and dtype == torch.bfloat16:
                continue

            with tempfile.TemporaryDirectory() as tmpdirname:
                model.to(dtype)
                model.save_pretrained(tmpdirname)

                new_model = self.model_class.from_pretrained(tmpdirname, low_cpu_mem_usage=True, torch_dtype=dtype)
                assert new_model.dtype == dtype

                if (
                    hasattr(self.model_class, "_keep_in_fp32_modules")
                    and self.model_class._keep_in_fp32_modules is None
                ):
                    # When loading without accelerate dtype == torch.float32 if _keep_in_fp32_modules is not None
                    new_model = self.model_class.from_pretrained(
                        tmpdirname, low_cpu_mem_usage=False, torch_dtype=dtype
                    )
                    assert new_model.dtype == dtype

    def test_determinism(self, expected_max_diff=1e-5):
        """Two forward passes on the same model give the same output (NaN positions are ignored)."""
        model = self.model_class(**self.get_init_dict())
        model.to(torch_device)
        model.eval()

        with torch.no_grad():
            first = self._first_output(model(**self.get_dummy_inputs()))
            second = self._first_output(model(**self.get_dummy_inputs()))

        # Remove NaN values and compute max difference
        first_flat = first.flatten()
        second_flat = second.flatten()
        mask = ~(torch.isnan(first_flat) | torch.isnan(second_flat))
        max_diff = torch.abs(first_flat[mask] - second_flat[mask]).max().item()

        assert (
            max_diff <= expected_max_diff
        ), f"Model outputs are not deterministic. Max diff: {max_diff}, expected: {expected_max_diff}"

    def test_output(self, expected_output_shape=None):
        """
        The model returns a non-None output of the expected shape.

        Args:
            expected_output_shape: Shape the first output must have. When omitted (as it is when
                pytest collects this method directly), the shape of the input named by
                `main_input_name` is used instead of comparing against `None`.
        """
        model = self.model_class(**self.get_init_dict())
        model.to(torch_device)
        model.eval()

        inputs_dict = self.get_dummy_inputs()
        with torch.no_grad():
            output = self._first_output(model(**inputs_dict))

        assert output is not None, "Model output is None"

        if expected_output_shape is None:
            main_input_name = getattr(self, "main_input_name", None)
            assert (
                main_input_name is not None
            ), "Set `main_input_name` on the test class or pass `expected_output_shape` explicitly"
            expected_output_shape = inputs_dict[main_input_name].shape

        assert (
            output.shape == expected_output_shape
        ), f"Output shape does not match expected. Expected {expected_output_shape}, got {output.shape}"

    def test_outputs_equivalence(self):
        """Outputs returned with `return_dict=False` match the dict-style outputs element by element."""

        def set_nan_tensor_to_zero(t):
            # Temporary fallback until `aten::_index_put_impl_` is implemented in mps
            # Track progress in https://github.com/pytorch/pytorch/issues/77764
            device = t.device
            if device.type == "mps":
                t = t.to("cpu")
            t[t != t] = 0
            return t.to(device)

        def recursive_check(tuple_object, dict_object):
            if isinstance(tuple_object, (list, tuple)):
                for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
                    recursive_check(tuple_iterable_value, dict_iterable_value)
            elif isinstance(tuple_object, dict):
                for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
                    recursive_check(tuple_iterable_value, dict_iterable_value)
            elif tuple_object is None:
                return
            else:
                # `.any()` keeps the failure message to a single flag instead of a whole tensor.
                assert torch.allclose(
                    set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
                ), (
                    "Tuple and dict output are not equal. Difference:"
                    f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
                    f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object).any()}. Dict has"
                    f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object).any()}."
                )

        model = self.model_class(**self.get_init_dict())
        model.to(torch_device)
        model.eval()

        with torch.no_grad():
            outputs_dict = model(**self.get_dummy_inputs())
            outputs_tuple = model(**self.get_dummy_inputs(), return_dict=False)

        recursive_check(outputs_tuple, outputs_dict)

    def test_model_config_to_json_string(self):
        """`model.config.to_json_string()` returns a non-empty string."""
        model = self.model_class(**self.get_init_dict())
        json_string = model.config.to_json_string()

        assert isinstance(json_string, str), "Config to_json_string should return a string"
        assert len(json_string) > 0, "JSON string should not be empty"

    @require_accelerator
    @pytest.mark.skipif(
        torch_device not in ["cuda", "xpu"],
        # pytest requires an explicit reason whenever the condition is a boolean.
        reason="float16/bfloat16 save-load round trip is only exercised on CUDA and XPU devices",
    )
    def test_from_save_pretrained_float16_bfloat16(self):
        """
        Half-precision checkpoints reload with the requested dtype, except for modules listed in
        `_keep_in_fp32_modules`, and produce matching outputs.
        """
        model = self.model_class(**self.get_init_dict())
        model.to(torch_device)
        # `_keep_in_fp32_modules` may be None; treat that as "no module is kept in fp32".
        fp32_modules = getattr(model, "_keep_in_fp32_modules", None) or []

        with tempfile.TemporaryDirectory() as tmp_dir:
            for torch_dtype in [torch.bfloat16, torch.float16]:
                model.to(torch_dtype).save_pretrained(tmp_dir)
                model_loaded = self.model_class.from_pretrained(tmp_dir, torch_dtype=torch_dtype).to(torch_device)

                for name, param in model_loaded.named_parameters():
                    if any(module_to_keep_in_fp32 in name.split(".") for module_to_keep_in_fp32 in fp32_modules):
                        assert param.data.dtype == torch.float32
                    else:
                        assert param.data.dtype == torch_dtype

                # NOTE(review): assumes `get_dummy_inputs()` returns a flat dict; floating point
                # tensors are cast so they match the half-precision weights -- confirm for models
                # with nested inputs.
                inputs = {
                    key: value.to(torch_dtype) if torch.is_tensor(value) and value.is_floating_point() else value
                    for key, value in self.get_dummy_inputs().items()
                }

                with torch.no_grad():
                    output = self._first_output(model(**inputs))
                    output_loaded = self._first_output(model_loaded(**inputs))

                # Compare in float32 so a dtype difference between the two outputs cannot make
                # `allclose` itself raise.
                assert torch.allclose(
                    output.float(), output_loaded.float(), atol=1e-4
                ), f"Loaded model output differs for {torch_dtype}"

    @require_accelerator
    def test_sharded_checkpoints(self):
        """Saving with a small `max_shard_size` produces the announced shards and reloads correctly."""
        torch.manual_seed(0)
        config = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()
        model = self.model_class(**config).eval()
        model = model.to(torch_device)

        base_output = model(**inputs_dict)

        model_size = compute_module_persistent_sizes(model)[""]
        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small

        with tempfile.TemporaryDirectory() as tmp_dir:
            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
            index_path = os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)
            assert os.path.exists(index_path), "Index file should exist"

            # Check if the right number of shards exists
            self._assert_shard_count_matches_index(tmp_dir, index_path)

            new_model = self.model_class.from_pretrained(tmp_dir).eval()
            new_model = new_model.to(torch_device)

            torch.manual_seed(0)
            inputs_dict_new = self.get_dummy_inputs()
            new_output = new_model(**inputs_dict_new)

            assert torch.allclose(
                base_output[0], new_output[0], atol=1e-5
            ), "Output should match after sharded save/load"

    @require_accelerator
    def test_sharded_checkpoints_with_variant(self):
        """Same as `test_sharded_checkpoints`, but saving and loading with `variant="fp16"`."""
        torch.manual_seed(0)
        config = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()
        model = self.model_class(**config).eval()
        model = model.to(torch_device)

        base_output = model(**inputs_dict)

        model_size = compute_module_persistent_sizes(model)[""]
        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small
        variant = "fp16"

        with tempfile.TemporaryDirectory() as tmp_dir:
            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB", variant=variant)

            index_filename = _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)
            index_path = os.path.join(tmp_dir, index_filename)
            assert os.path.exists(index_path), f"Variant index file {index_filename} should exist"

            # Check if the right number of shards exists
            self._assert_shard_count_matches_index(tmp_dir, index_path)

            new_model = self.model_class.from_pretrained(tmp_dir, variant=variant).eval()
            new_model = new_model.to(torch_device)

            torch.manual_seed(0)
            inputs_dict_new = self.get_dummy_inputs()
            new_output = new_model(**inputs_dict_new)

            assert torch.allclose(
                base_output[0], new_output[0], atol=1e-5
            ), "Output should match after variant sharded save/load"

    @require_accelerator
    def test_sharded_checkpoints_with_parallel_loading(self):
        """
        A sharded checkpoint gives the same outputs whether it is loaded with parallel loading
        disabled or enabled.

        Load times are deliberately not compared: for the small models used here thread overhead
        can make parallel loading slower, which would make a timing assertion flaky.
        """
        from diffusers.utils import constants

        torch.manual_seed(0)
        config = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()
        model = self.model_class(**config).eval()
        model = model.to(torch_device)

        base_output = model(**inputs_dict)

        model_size = compute_module_persistent_sizes(model)[""]
        max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small

        # Save exactly the values this test overrides so `finally` can restore them.
        missing = object()
        original_parallel_loading = constants.HF_ENABLE_PARALLEL_LOADING
        original_parallel_workers = getattr(constants, "DEFAULT_HF_PARALLEL_LOADING_WORKERS", missing)

        try:
            with tempfile.TemporaryDirectory() as tmp_dir:
                model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
                index_path = os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)
                assert os.path.exists(index_path), "Index file should exist"

                # Check if the right number of shards exists
                self._assert_shard_count_matches_index(tmp_dir, index_path)

                # NOTE(review): this assumes `from_pretrained` reads these constants at call time;
                # if the loader captured them at import, toggling them here has no effect -- confirm.

                # Load without parallel loading
                constants.HF_ENABLE_PARALLEL_LOADING = False
                model_sequential = self.model_class.from_pretrained(tmp_dir).eval()
                model_sequential = model_sequential.to(torch_device)

                torch.manual_seed(0)
                output_sequential = model_sequential(**self.get_dummy_inputs())
                assert torch.allclose(
                    base_output[0], output_sequential[0], atol=1e-5
                ), "Output should match with sequential loading"

                # Load with parallel loading
                constants.HF_ENABLE_PARALLEL_LOADING = True
                constants.DEFAULT_HF_PARALLEL_LOADING_WORKERS = 2
                model_parallel = self.model_class.from_pretrained(tmp_dir).eval()
                model_parallel = model_parallel.to(torch_device)

                torch.manual_seed(0)
                inputs_dict_parallel = self.get_dummy_inputs()
                output_parallel = model_parallel(**inputs_dict_parallel)

                assert torch.allclose(
                    base_output[0], output_parallel[0], atol=1e-5
                ), "Output should match with parallel loading"
        finally:
            # Restore original values
            constants.HF_ENABLE_PARALLEL_LOADING = original_parallel_loading
            if original_parallel_workers is not missing:
                constants.DEFAULT_HF_PARALLEL_LOADING_WORKERS = original_parallel_workers
            elif hasattr(constants, "DEFAULT_HF_PARALLEL_LOADING_WORKERS"):
                # The attribute did not exist before the test; remove what the test created.
                delattr(constants, "DEFAULT_HF_PARALLEL_LOADING_WORKERS")

    @require_torch_multi_accelerator
    def test_model_parallelism(self):
        """Loading with `device_map="auto"` splits the model over devices 0 and 1 and keeps outputs."""
        if self.model_class._no_split_modules is None:
            pytest.skip("Test not supported for this model as `_no_split_modules` is not set.")

        config = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()
        model = self.model_class(**config).eval()
        model = model.to(torch_device)

        torch.manual_seed(0)
        base_output = model(**inputs_dict)

        model_size = compute_module_sizes(model)[""]
        max_gpu_sizes = [int(p * model_size) for p in self.model_split_percents]

        with tempfile.TemporaryDirectory() as tmp_dir:
            model.cpu().save_pretrained(tmp_dir)

            for max_size in max_gpu_sizes:
                # Device 0 is capped below the model size so part of the model must go elsewhere.
                max_memory = {0: max_size, 1: model_size * 2, "cpu": model_size * 2}
                new_model = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=max_memory)

                # Making sure part of the model will be on GPU 0 and GPU 1
                assert set(new_model.hf_device_map.values()) == {0, 1}, "Model should be split across GPUs"

                check_device_map_is_respected(new_model, new_model.hf_device_map)

                torch.manual_seed(0)
                new_output = new_model(**inputs_dict)

                assert torch.allclose(
                    base_output[0], new_output[0], atol=1e-5
                ), "Output should match with model parallelism"
+162
View File
@@ -0,0 +1,162 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import os
import tempfile
import pytest
import torch
from ...testing_utils import (
backend_empty_cache,
is_torch_compile,
require_accelerator,
require_torch_version_greater,
torch_device,
)
@is_torch_compile
@require_accelerator
@require_torch_version_greater("2.7.1")
class TorchCompileTesterMixin:
    """
    Mixin class for testing torch.compile functionality on models.

    Expected class attributes to be set by subclasses:
        - model_class: The model class to test
        - different_shapes_for_compilation: Optional list of (height, width) tuples for dynamic shape testing

    Expected methods to be implemented by subclasses:
        - get_init_dict(): Returns dict of arguments to initialize the model
        - get_dummy_inputs(): Returns dict of inputs to pass to the model forward pass

    Pytest mark: compile
        Use `pytest -m "not compile"` to skip these tests
    """

    different_shapes_for_compilation = None

    def setup_method(self):
        """Reset compiler state and free cached device memory before each test."""
        torch.compiler.reset()
        gc.collect()
        backend_empty_cache(torch_device)

    def teardown_method(self):
        """Reset compiler state and free cached device memory after each test."""
        torch.compiler.reset()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_torch_compile_recompilation_and_graph_break(self):
        """Compile with `fullgraph=True` and run twice with recompilation treated as an error."""
        init_dict = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()

        model = self.model_class(**init_dict).to(torch_device)
        model.eval()
        model = torch.compile(model, fullgraph=True)

        with (
            torch._inductor.utils.fresh_inductor_cache(),
            torch._dynamo.config.patch(error_on_recompile=True),
            torch.no_grad(),
        ):
            _ = model(**inputs_dict)
            _ = model(**inputs_dict)

    def test_torch_compile_repeated_blocks(self):
        """Compile only the model's repeated blocks and run twice under a tight recompile limit."""
        if self.model_class._repeated_blocks is None:
            pytest.skip("Skipping test as the model class doesn't have `_repeated_blocks` set.")

        init_dict = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()

        model = self.model_class(**init_dict).to(torch_device)
        model.eval()
        model.compile_repeated_blocks(fullgraph=True)

        # UNet2DConditionModel is allowed one extra recompilation.
        recompile_limit = 1
        if self.model_class.__name__ == "UNet2DConditionModel":
            recompile_limit = 2

        with (
            torch._inductor.utils.fresh_inductor_cache(),
            torch._dynamo.config.patch(recompile_limit=recompile_limit),
            torch.no_grad(),
        ):
            _ = model(**inputs_dict)
            _ = model(**inputs_dict)

    def test_compile_with_group_offloading(self):
        """Run a compiled model twice with block-level group offloading enabled."""
        if not self.model_class._supports_group_offloading:
            pytest.skip("Model does not support group offloading.")

        # The raised limit is process-global state; restore it afterwards so it does not leak
        # into tests that run later in the same session.
        previous_cache_size_limit = torch._dynamo.config.cache_size_limit
        torch._dynamo.config.cache_size_limit = 10000
        try:
            init_dict = self.get_init_dict()
            inputs_dict = self.get_dummy_inputs()

            model = self.model_class(**init_dict)
            model.eval()

            group_offload_kwargs = {
                "onload_device": torch_device,
                "offload_device": "cpu",
                "offload_type": "block_level",
                "num_blocks_per_group": 1,
                "use_stream": True,
                "non_blocking": True,
            }
            model.enable_group_offload(**group_offload_kwargs)
            model.compile()

            with torch.no_grad():
                _ = model(**inputs_dict)
                _ = model(**inputs_dict)
        finally:
            torch._dynamo.config.cache_size_limit = previous_cache_size_limit

    def test_compile_on_different_shapes(self):
        """Compile with `dynamic=True` and run every configured (height, width) without recompiling."""
        if self.different_shapes_for_compilation is None:
            pytest.skip(f"Skipping as `different_shapes_for_compilation` is not set for {self.__class__.__name__}.")

        # `use_duck_shape` is process-global; restore the previous value once the test is done.
        previous_use_duck_shape = torch.fx.experimental._config.use_duck_shape
        torch.fx.experimental._config.use_duck_shape = False
        try:
            init_dict = self.get_init_dict()
            model = self.model_class(**init_dict).to(torch_device)
            model.eval()
            model = torch.compile(model, fullgraph=True, dynamic=True)

            for height, width in self.different_shapes_for_compilation:
                with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
                    inputs_dict = self.get_dummy_inputs(height=height, width=width)
                    _ = model(**inputs_dict)
        finally:
            torch.fx.experimental._config.use_duck_shape = previous_use_duck_shape

    def test_compile_works_with_aot(self):
        """Export the model, package it ahead of time, load the package and run it twice."""
        from torch._inductor.package import load_package

        init_dict = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()

        model = self.model_class(**init_dict).to(torch_device)
        exported_model = torch.export.export(model, args=(), kwargs=inputs_dict)

        with tempfile.TemporaryDirectory() as tmpdir:
            package_path = os.path.join(tmpdir, f"{self.model_class.__name__}.pt2")
            _ = torch._inductor.aoti_compile_and_package(exported_model, package_path=package_path)
            assert os.path.exists(package_path), f"Package file not created at {package_path}"
            loaded_binary = load_package(package_path, run_single_threaded=True)

        # Route the forward pass through the packaged binary.
        model.forward = loaded_binary

        with torch.no_grad():
            _ = model(**inputs_dict)
            _ = model(**inputs_dict)
+109
View File
@@ -0,0 +1,109 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import tempfile
import uuid
import pytest
import torch
from huggingface_hub.utils import is_jinja_available
from ...others.test_utils import TOKEN, USER, is_staging_test
@is_staging_test
class ModelPushToHubTesterMixin:
    """
    Mixin class for testing push_to_hub functionality on models.

    Expected class attributes to be set by subclasses:
        - model_class: The model class to test

    Expected methods to be implemented by subclasses:
        - get_init_dict(): Returns dict of arguments to initialize the model
    """

    # Evaluated once at class-definition time, so all tests of a class share the same repo names.
    identifier = uuid.uuid4()
    repo_id = f"test-model-{identifier}"
    org_repo_id = f"valid_org/{repo_id}-org"

    @staticmethod
    def _assert_same_parameters(model, new_model, message):
        """Assert that both models hold pairwise identical parameters."""
        for p1, p2 in zip(model.parameters(), new_model.parameters()):
            assert torch.equal(p1, p2), message

    def test_push_to_hub(self):
        """Test pushing model to hub and loading it back."""
        # Imported locally: the module's import block does not bring `delete_repo` into scope.
        from huggingface_hub import delete_repo

        init_dict = self.get_init_dict()
        model = self.model_class(**init_dict)
        model.push_to_hub(self.repo_id, token=TOKEN)

        new_model = self.model_class.from_pretrained(f"{USER}/{self.repo_id}")
        self._assert_same_parameters(
            model, new_model, "Parameters don't match after push_to_hub and from_pretrained"
        )

        # Reset repo
        delete_repo(token=TOKEN, repo_id=self.repo_id)

        # Push to hub via save_pretrained
        with tempfile.TemporaryDirectory() as tmp_dir:
            model.save_pretrained(tmp_dir, repo_id=self.repo_id, push_to_hub=True, token=TOKEN)

        new_model = self.model_class.from_pretrained(f"{USER}/{self.repo_id}")
        self._assert_same_parameters(
            model,
            new_model,
            "Parameters don't match after save_pretrained with push_to_hub and from_pretrained",
        )

        # Reset repo
        delete_repo(self.repo_id, token=TOKEN)

    def test_push_to_hub_in_organization(self):
        """Test pushing model to hub in organization namespace."""
        # Imported locally: the module's import block does not bring `delete_repo` into scope.
        from huggingface_hub import delete_repo

        init_dict = self.get_init_dict()
        model = self.model_class(**init_dict)
        model.push_to_hub(self.org_repo_id, token=TOKEN)

        new_model = self.model_class.from_pretrained(self.org_repo_id)
        self._assert_same_parameters(
            model, new_model, "Parameters don't match after push_to_hub to org and from_pretrained"
        )

        # Reset repo
        delete_repo(token=TOKEN, repo_id=self.org_repo_id)

        # Push to hub via save_pretrained
        with tempfile.TemporaryDirectory() as tmp_dir:
            model.save_pretrained(tmp_dir, push_to_hub=True, token=TOKEN, repo_id=self.org_repo_id)

        new_model = self.model_class.from_pretrained(self.org_repo_id)
        self._assert_same_parameters(
            model,
            new_model,
            "Parameters don't match after save_pretrained with push_to_hub to org and from_pretrained",
        )

        # Reset repo
        delete_repo(self.org_repo_id, token=TOKEN)

    def test_push_to_hub_library_name(self):
        """Test that library_name in model card is set to 'diffusers'."""
        # Imported locally: the module's import block does not bring these names into scope.
        from huggingface_hub import ModelCard, delete_repo

        if not is_jinja_available():
            pytest.skip("Model card tests cannot be performed without Jinja installed.")

        init_dict = self.get_init_dict()
        model = self.model_class(**init_dict)
        model.push_to_hub(self.repo_id, token=TOKEN)

        model_card = ModelCard.load(f"{USER}/{self.repo_id}", token=TOKEN).data
        assert (
            model_card.library_name == "diffusers"
        ), f"Expected library_name 'diffusers', got {model_card.library_name}"

        # Reset repo
        delete_repo(self.repo_id, token=TOKEN)
+205
View File
@@ -0,0 +1,205 @@
# coding=utf-8
# Copyright 2025 HuggingFace Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import tempfile
import torch
from diffusers.models.attention_processor import IPAdapterAttnProcessor
from ...testing_utils import is_ip_adapter, torch_device
def create_ip_adapter_state_dict(model):
    """
    Build a dummy IP Adapter state dict for `model`, for use in tests.

    One pair of `to_k_ip` / `to_v_ip` weights is generated per entry of `model.attn_processors`,
    keyed by the odd ids 1, 3, 5, ... When the model config has no `cross_attention_dim`
    (or it is None), no weights are generated at all.

    Args:
        model: The model to create IP adapter weights for

    Returns:
        dict: `{"ip_adapter": weights}` where `weights` maps `"<id>.to_k_ip.weight"` and
        `"<id>.to_v_ip.weight"` to tensors
    """
    weights = {}

    for position, _ in enumerate(model.attn_processors.keys()):
        # NOTE(review): this check reads the model-level config, so it is the same for every
        # processor -- it does not single out individual self-attention processors.
        cross_attention_dim = getattr(model.config, "cross_attention_dim", None)
        if cross_attention_dim is not None:
            # Fall back to the cross-attention width when the config has no `hidden_size`.
            hidden_size = getattr(model.config, "hidden_size", cross_attention_dim)

            # A fresh processor is instantiated only to borrow correctly shaped weights.
            processor = IPAdapterAttnProcessor(
                hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0
            )
            processor_weights = processor.state_dict()

            key_id = 2 * position + 1
            weights[f"{key_id}.to_k_ip.weight"] = processor_weights["to_k_ip.0.weight"]
            weights[f"{key_id}.to_v_ip.weight"] = processor_weights["to_v_ip.0.weight"]

    return {"ip_adapter": weights}
def check_if_ip_adapter_correctly_set(model) -> bool:
    """
    Tell whether at least one attention processor of `model` is an `IPAdapterAttnProcessor`.

    Args:
        model: The model to check

    Returns:
        bool: True if any value of `model.attn_processors` is an `IPAdapterAttnProcessor`,
        False otherwise
    """
    return any(isinstance(processor, IPAdapterAttnProcessor) for processor in model.attn_processors.values())
@is_ip_adapter
class IPAdapterTesterMixin:
    """
    Mixin class for testing IP Adapter functionality on models.

    Expected class attributes to be set by subclasses:
        - model_class: The model class to test

    Expected methods to be implemented by subclasses:
        - get_init_dict(): Returns dict of arguments to initialize the model
        - get_dummy_inputs(): Returns dict of inputs to pass to the model forward pass
        - create_ip_adapter_state_dict(model): Returns the IP Adapter state dict to load into `model`

    Pytest mark: ip_adapter
        Use `pytest -m "not ip_adapter"` to skip these tests
    """

    def create_ip_adapter_state_dict(self, model):
        raise NotImplementedError("child class must implement method to create IPAdapter State Dict")

    @staticmethod
    def _add_image_embeds(model, inputs_dict):
        """
        Return a copy of `inputs_dict` extended with seeded random `image_embeds`.

        The embeds have shape `(1, 1, cross_attention_dim)`, with `cross_attention_dim` read from
        the model config (32 when the config does not define it). The seed is fixed so repeated
        calls produce the same embeds.
        """
        torch.manual_seed(0)
        cross_attention_dim = getattr(model.config, "cross_attention_dim", 32)
        image_embeds = torch.randn(1, 1, cross_attention_dim).to(torch_device)

        inputs_dict_with_adapter = inputs_dict.copy()
        inputs_dict_with_adapter["image_embeds"] = image_embeds
        return inputs_dict_with_adapter

    def test_load_ip_adapter(self):
        """Loading IP Adapter weights installs IP Adapter processors and changes the output."""
        init_dict = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()
        model = self.model_class(**init_dict).to(torch_device)

        torch.manual_seed(0)
        output_no_adapter = model(**inputs_dict, return_dict=False)[0]

        # Create dummy IP adapter state dict
        ip_adapter_state_dict = self.create_ip_adapter_state_dict(model)

        # Load IP adapter
        model._load_ip_adapter_weights([ip_adapter_state_dict])
        assert check_if_ip_adapter_correctly_set(model), "IP Adapter processors not set correctly"

        # Create dummy image embeds for IP adapter
        inputs_dict_with_adapter = self._add_image_embeds(model, inputs_dict)
        outputs_with_adapter = model(**inputs_dict_with_adapter, return_dict=False)[0]

        assert not torch.allclose(
            output_no_adapter, outputs_with_adapter, atol=1e-4, rtol=1e-4
        ), "Output should differ with IP Adapter enabled"

    def test_ip_adapter_scale(self):
        """Changing the IP Adapter scale from 0.0 to 1.0 changes the output."""
        init_dict = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()
        model = self.model_class(**init_dict).to(torch_device)

        # Create and load dummy IP adapter state dict
        ip_adapter_state_dict = self.create_ip_adapter_state_dict(model)
        model._load_ip_adapter_weights([ip_adapter_state_dict])

        # Both forward passes use the same inputs, so only the scale differs between them.
        inputs_dict_with_adapter = self._add_image_embeds(model, inputs_dict)

        # Test scale = 0.0 (no effect)
        model.set_ip_adapter_scale(0.0)
        torch.manual_seed(0)
        output_scale_zero = model(**inputs_dict_with_adapter, return_dict=False)[0]

        # Test scale = 1.0 (full effect)
        model.set_ip_adapter_scale(1.0)
        torch.manual_seed(0)
        output_scale_one = model(**inputs_dict_with_adapter, return_dict=False)[0]

        # Outputs should differ with different scales
        assert not torch.allclose(
            output_scale_zero, output_scale_one, atol=1e-4, rtol=1e-4
        ), "Output should differ with different IP Adapter scales"

    def test_unload_ip_adapter(self):
        """Unloading the IP Adapter restores the original attention processor types."""
        init_dict = self.get_init_dict()
        model = self.model_class(**init_dict).to(torch_device)

        # Save original processors
        original_processors = {k: type(v).__name__ for k, v in model.attn_processors.items()}

        # Create and load IP adapter
        ip_adapter_state_dict = self.create_ip_adapter_state_dict(model)
        model._load_ip_adapter_weights([ip_adapter_state_dict])
        assert check_if_ip_adapter_correctly_set(model), "IP Adapter should be set"

        # Unload IP adapter
        model.unload_ip_adapter()
        assert not check_if_ip_adapter_correctly_set(model), "IP Adapter should be unloaded"

        # Verify processors are restored
        current_processors = {k: type(v).__name__ for k, v in model.attn_processors.items()}
        assert original_processors == current_processors, "Processors should be restored after unload"

    def test_ip_adapter_save_load(self):
        """IP Adapter weights written to and read back from a safetensors file give the same output."""
        import safetensors.torch

        init_dict = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()
        model = self.model_class(**init_dict).to(torch_device)

        # Create and load IP adapter
        ip_adapter_state_dict = self.create_ip_adapter_state_dict(model)
        model._load_ip_adapter_weights([ip_adapter_state_dict])

        # The same adapter inputs are used before and after the round trip; otherwise the
        # comparison would mix outputs computed from different inputs.
        inputs_dict_with_adapter = self._add_image_embeds(model, inputs_dict)

        torch.manual_seed(0)
        output_before_save = model(**inputs_dict_with_adapter, return_dict=False)[0]

        with tempfile.TemporaryDirectory() as tmpdir:
            # Save the IP adapter weights
            save_path = os.path.join(tmpdir, "ip_adapter.safetensors")
            safetensors.torch.save_file(ip_adapter_state_dict["ip_adapter"], save_path)

            # Unload and reload
            model.unload_ip_adapter()
            assert not check_if_ip_adapter_correctly_set(model), "IP Adapter should be unloaded"

            # Reload from saved file
            loaded_state_dict = {"ip_adapter": safetensors.torch.load_file(save_path)}
            model._load_ip_adapter_weights([loaded_state_dict])
            assert check_if_ip_adapter_correctly_set(model), "IP Adapter should be loaded"

        torch.manual_seed(0)
        output_after_load = model(**inputs_dict_with_adapter, return_dict=False)[0]

        # Outputs should match before and after save/load
        assert torch.allclose(
            output_before_save, output_after_load, atol=1e-4, rtol=1e-4
        ), "Output should match before and after save/load"

Some files were not shown because too many files have changed in this diff Show More