Compare commits
23 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 0a8ec018a0 | |||
| 0c35b580fe | |||
| 01a56927f1 | |||
| a9e4883b6a | |||
| 63dd601758 | |||
| eeae0338e7 | |||
| 3c1ca869d7 | |||
| 6fe4a6ff8e | |||
| 40de88af8c | |||
| 6a2309b98d | |||
| cd3bbe2910 | |||
| 7a001c3ee2 | |||
| d8e4805816 | |||
| 44c3101685 | |||
| d6c63bb956 | |||
| 2f44d63046 | |||
| f3db38c1e7 | |||
| f5e5f34823 | |||
| 093cd3f040 | |||
| aecf0c53bf | |||
| 0c7589293b | |||
| ff263947ad | |||
| 66e6a0215f |
@@ -84,7 +84,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
-k "not Flax and not Onnx" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
--report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
@@ -138,7 +138,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
-k "not Flax and not Onnx" \
|
||||
--make-reports=tests_torch_${{ matrix.module }}_cuda \
|
||||
--report-log=tests_torch_${{ matrix.module }}_cuda.log \
|
||||
tests/${{ matrix.module }}
|
||||
@@ -151,7 +151,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v --make-reports=examples_torch_cuda \
|
||||
--make-reports=examples_torch_cuda \
|
||||
--report-log=examples_torch_cuda.log \
|
||||
examples/
|
||||
|
||||
@@ -198,7 +198,7 @@ jobs:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
RUN_COMPILE: yes
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "compile" --make-reports=tests_torch_compile_cuda tests/
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_compile_cuda_failures_short.txt
|
||||
@@ -293,7 +293,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
-k "not Flax and not Onnx" \
|
||||
--make-reports=tests_torch_minimum_version_cuda \
|
||||
tests/models/test_modeling_common.py \
|
||||
tests/pipelines/test_pipelines_common.py \
|
||||
@@ -531,7 +531,7 @@ jobs:
|
||||
# HF_HOME: /System/Volumes/Data/mnt/cache
|
||||
# HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
# run: |
|
||||
# ${CONDA_RUN} pytest -n 1 -s -v --make-reports=tests_torch_mps \
|
||||
# ${CONDA_RUN} pytest -n 1 --make-reports=tests_torch_mps \
|
||||
# --report-log=tests_torch_mps.log \
|
||||
# tests/
|
||||
# - name: Failure short reports
|
||||
@@ -587,7 +587,7 @@ jobs:
|
||||
# HF_HOME: /System/Volumes/Data/mnt/cache
|
||||
# HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
# run: |
|
||||
# ${CONDA_RUN} pytest -n 1 -s -v --make-reports=tests_torch_mps \
|
||||
# ${CONDA_RUN} pytest -n 1 --make-reports=tests_torch_mps \
|
||||
# --report-log=tests_torch_mps.log \
|
||||
# tests/
|
||||
# - name: Failure short reports
|
||||
|
||||
@@ -120,7 +120,7 @@ jobs:
|
||||
if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
|
||||
run: |
|
||||
pytest -n 8 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
-k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/modular_pipelines
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ jobs:
|
||||
if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
|
||||
run: |
|
||||
pytest -n 8 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
-k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/pipelines
|
||||
|
||||
@@ -134,7 +134,7 @@ jobs:
|
||||
if: ${{ matrix.config.framework == 'pytorch_models' }}
|
||||
run: |
|
||||
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx and not Dependency" \
|
||||
-k "not Flax and not Onnx and not Dependency" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/models tests/schedulers tests/others
|
||||
|
||||
@@ -255,11 +255,11 @@ jobs:
|
||||
- name: Run fast PyTorch LoRA tests with PEFT
|
||||
run: |
|
||||
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v \
|
||||
\
|
||||
--make-reports=tests_peft_main \
|
||||
tests/lora/
|
||||
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v \
|
||||
\
|
||||
--make-reports=tests_models_lora_peft_main \
|
||||
tests/models/ -k "lora"
|
||||
|
||||
|
||||
@@ -151,13 +151,13 @@ jobs:
|
||||
run: |
|
||||
if [ "${{ matrix.module }}" = "ip_adapters" ]; then
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
-k "not Flax and not Onnx" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
else
|
||||
pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx and $pattern" \
|
||||
-k "not Flax and not Onnx and $pattern" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
fi
|
||||
@@ -222,10 +222,10 @@ jobs:
|
||||
run: |
|
||||
pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
|
||||
if [ -z "$pattern" ]; then
|
||||
pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
|
||||
--make-reports=tests_torch_cuda_${{ matrix.module }}
|
||||
else
|
||||
pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
|
||||
--make-reports=tests_torch_cuda_${{ matrix.module }}
|
||||
fi
|
||||
|
||||
@@ -274,7 +274,7 @@ jobs:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
run: |
|
||||
uv pip install ".[training]"
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile --make-reports=examples_torch_cuda examples/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
|
||||
@@ -87,7 +87,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
-k "not Flax and not Onnx" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
- name: Failure short reports
|
||||
@@ -141,7 +141,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
-k "not Flax and not Onnx" \
|
||||
--make-reports=tests_torch_cuda_${{ matrix.module }} \
|
||||
tests/${{ matrix.module }}
|
||||
|
||||
@@ -189,7 +189,7 @@ jobs:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
RUN_COMPILE: yes
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "compile" --make-reports=tests_torch_compile_cuda tests/
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_compile_cuda_failures_short.txt
|
||||
@@ -230,7 +230,7 @@ jobs:
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_xformers_cuda_failures_short.txt
|
||||
@@ -273,7 +273,7 @@ jobs:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
run: |
|
||||
uv pip install ".[training]"
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile --make-reports=examples_torch_cuda examples/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
|
||||
@@ -70,7 +70,7 @@ jobs:
|
||||
if: ${{ matrix.config.framework == 'pytorch' }}
|
||||
run: |
|
||||
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
-k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
|
||||
@@ -57,7 +57,7 @@ jobs:
|
||||
HF_HOME: /System/Volumes/Data/mnt/cache
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
|
||||
${CONDA_RUN} python -m pytest -n 0 --make-reports=tests_torch_mps tests/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
|
||||
@@ -84,7 +84,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
-k "not Flax and not Onnx" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
- name: Failure short reports
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
-k "not Flax and not Onnx" \
|
||||
--make-reports=tests_torch_${{ matrix.module }}_cuda \
|
||||
tests/${{ matrix.module }}
|
||||
|
||||
@@ -187,7 +187,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
-k "not Flax and not Onnx" \
|
||||
--make-reports=tests_torch_minimum_cuda \
|
||||
tests/models/test_modeling_common.py \
|
||||
tests/pipelines/test_pipelines_common.py \
|
||||
@@ -240,7 +240,7 @@ jobs:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
RUN_COMPILE: yes
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "compile" --make-reports=tests_torch_compile_cuda tests/
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_compile_cuda_failures_short.txt
|
||||
@@ -281,7 +281,7 @@ jobs:
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_xformers_cuda_failures_short.txt
|
||||
@@ -326,7 +326,7 @@ jobs:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
run: |
|
||||
uv pip install ".[training]"
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile --make-reports=examples_torch_cuda examples/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
|
||||
@@ -22,6 +22,8 @@
|
||||
title: Reproducibility
|
||||
- local: using-diffusers/schedulers
|
||||
title: Schedulers
|
||||
- local: using-diffusers/automodel
|
||||
title: AutoModel
|
||||
- local: using-diffusers/other-formats
|
||||
title: Model formats
|
||||
- local: using-diffusers/push_to_hub
|
||||
@@ -119,6 +121,8 @@
|
||||
title: ComponentsManager
|
||||
- local: modular_diffusers/guiders
|
||||
title: Guiders
|
||||
- local: modular_diffusers/custom_blocks
|
||||
title: Building Custom Blocks
|
||||
title: Modular Diffusers
|
||||
- isExpanded: false
|
||||
sections:
|
||||
@@ -387,6 +391,8 @@
|
||||
title: Transformer2DModel
|
||||
- local: api/models/transformer_temporal
|
||||
title: TransformerTemporalModel
|
||||
- local: api/models/wan_animate_transformer_3d
|
||||
title: WanAnimateTransformer3DModel
|
||||
- local: api/models/wan_transformer_3d
|
||||
title: WanTransformer3DModel
|
||||
title: Transformers
|
||||
@@ -448,6 +454,8 @@
|
||||
- sections:
|
||||
- local: api/pipelines/overview
|
||||
title: Overview
|
||||
- local: api/pipelines/auto_pipeline
|
||||
title: AutoPipeline
|
||||
- sections:
|
||||
- local: api/pipelines/audioldm
|
||||
title: AudioLDM
|
||||
@@ -460,8 +468,6 @@
|
||||
- local: api/pipelines/stable_audio
|
||||
title: Stable Audio
|
||||
title: Audio
|
||||
- local: api/pipelines/auto_pipeline
|
||||
title: AutoPipeline
|
||||
- sections:
|
||||
- local: api/pipelines/amused
|
||||
title: aMUSEd
|
||||
@@ -525,6 +531,8 @@
|
||||
title: HiDream-I1
|
||||
- local: api/pipelines/hunyuandit
|
||||
title: Hunyuan-DiT
|
||||
- local: api/pipelines/hunyuanimage21
|
||||
title: HunyuanImage2.1
|
||||
- local: api/pipelines/pix2pix
|
||||
title: InstructPix2Pix
|
||||
- local: api/pipelines/kandinsky
|
||||
@@ -638,8 +646,6 @@
|
||||
title: ConsisID
|
||||
- local: api/pipelines/framepack
|
||||
title: Framepack
|
||||
- local: api/pipelines/hunyuanimage21
|
||||
title: HunyuanImage2.1
|
||||
- local: api/pipelines/hunyuan_video
|
||||
title: HunyuanVideo
|
||||
- local: api/pipelines/i2vgenxl
|
||||
|
||||
@@ -12,15 +12,7 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# AutoModel
|
||||
|
||||
The `AutoModel` is designed to make it easy to load a checkpoint without needing to know the specific model class. `AutoModel` automatically retrieves the correct model class from the checkpoint `config.json` file.
|
||||
|
||||
```python
|
||||
from diffusers import AutoModel, AutoPipelineForText2Image
|
||||
|
||||
unet = AutoModel.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet")
|
||||
pipe = AutoPipelineForText2Image.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", unet=unet)
|
||||
```
|
||||
|
||||
[`AutoModel`] automatically retrieves the correct model class from the checkpoint `config.json` file.
|
||||
|
||||
## AutoModel
|
||||
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License. -->
|
||||
|
||||
# WanAnimateTransformer3DModel
|
||||
|
||||
`WanAnimateTransformer3DModel` is a Diffusion Transformer model for 3D video-like data, introduced in [Wan Animate](https://github.com/Wan-Video/Wan2.2) by the Alibaba Wan Team.
|
||||
|
||||
The model can be loaded with the following code snippet.
|
||||
|
||||
```python
|
||||
import torch
from diffusers import WanAnimateTransformer3DModel
|
||||
|
||||
transformer = WanAnimateTransformer3DModel.from_pretrained("Wan-AI/Wan2.2-Animate-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
|
||||
```
|
||||
|
||||
## WanAnimateTransformer3DModel
|
||||
|
||||
[[autodoc]] WanAnimateTransformer3DModel
|
||||
|
||||
## Transformer2DModelOutput
|
||||
|
||||
[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
|
||||
@@ -40,6 +40,7 @@ The following Wan models are supported in Diffusers:
|
||||
- [Wan 2.2 T2V 14B](https://huggingface.co/Wan-AI/Wan2.2-T2V-A14B-Diffusers)
|
||||
- [Wan 2.2 I2V 14B](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers)
|
||||
- [Wan 2.2 TI2V 5B](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B-Diffusers)
|
||||
- [Wan 2.2 Animate 14B](https://huggingface.co/Wan-AI/Wan2.2-Animate-14B-Diffusers)
|
||||
|
||||
> [!TIP]
|
||||
> Click on the Wan models in the right sidebar for more examples of video generation.
|
||||
@@ -95,15 +96,15 @@ pipeline = WanPipeline.from_pretrained(
|
||||
pipeline.to("cuda")
|
||||
|
||||
prompt = """
|
||||
The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
|
||||
"""
|
||||
negative_prompt = """
|
||||
Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
|
||||
low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
|
||||
Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
|
||||
low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
|
||||
misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards
|
||||
"""
|
||||
|
||||
@@ -150,15 +151,15 @@ pipeline.transformer = torch.compile(
|
||||
)
|
||||
|
||||
prompt = """
|
||||
The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
|
||||
"""
|
||||
negative_prompt = """
|
||||
Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
|
||||
low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
|
||||
Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
|
||||
low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
|
||||
misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards
|
||||
"""
|
||||
|
||||
@@ -249,6 +250,208 @@ The code snippets available in [this](https://github.com/huggingface/diffusers/p
|
||||
|
||||
The general rule of thumb to keep in mind when preparing inputs for the VACE pipeline is that the input images, or frames of a video that you want to use for conditioning, should have a corresponding mask that is black in color. The black mask signifies that the model will not generate new content for that area, and only use those parts for conditioning the generation process. For parts/frames that should be generated by the model, the mask should be white in color.
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Wan-Animate: Unified Character Animation and Replacement with Holistic Replication
|
||||
|
||||
[Wan-Animate](https://huggingface.co/papers/2509.14055) by the Wan Team.
|
||||
|
||||
*We introduce Wan-Animate, a unified framework for character animation and replacement. Given a character image and a reference video, Wan-Animate can animate the character by precisely replicating the expressions and movements of the character in the video to generate high-fidelity character videos. Alternatively, it can integrate the animated character into the reference video to replace the original character, replicating the scene's lighting and color tone to achieve seamless environmental integration. Wan-Animate is built upon the Wan model. To adapt it for character animation tasks, we employ a modified input paradigm to differentiate between reference conditions and regions for generation. This design unifies multiple tasks into a common symbolic representation. We use spatially-aligned skeleton signals to replicate body motion and implicit facial features extracted from source images to reenact expressions, enabling the generation of character videos with high controllability and expressiveness. Furthermore, to enhance environmental integration during character replacement, we develop an auxiliary Relighting LoRA. This module preserves the character's appearance consistency while applying the appropriate environmental lighting and color tone. Experimental results demonstrate that Wan-Animate achieves state-of-the-art performance. We are committed to open-sourcing the model weights and its source code.*
|
||||
|
||||
The project page: https://humanaigc.github.io/wan-animate
|
||||
|
||||
This model was mostly contributed by [M. Tolga Cangöz](https://github.com/tolgacangoz).
|
||||
|
||||
#### Usage
|
||||
|
||||
The Wan-Animate pipeline supports two modes of operation:
|
||||
|
||||
1. **Animation Mode** (default): Animates a character image based on motion and expression from reference videos
|
||||
2. **Replacement Mode**: Replaces a character in a background video with a new character while preserving the scene
|
||||
|
||||
##### Prerequisites
|
||||
|
||||
Before using the pipeline, you need to preprocess your reference video to extract:
|
||||
- **Pose video**: Contains skeletal keypoints representing body motion
|
||||
- **Face video**: Contains facial feature representations for expression control
|
||||
|
||||
For replacement mode, you additionally need:
|
||||
- **Background video**: The original video containing the scene
|
||||
- **Mask video**: A mask indicating where to generate content (white) vs. preserve original (black)
|
||||
|
||||
> [!NOTE]
|
||||
> Raw videos should not be used for inputs such as `pose_video`, which the pipeline expects to be preprocessed to extract the proper information. Preprocessing scripts to prepare these inputs are available in the [original Wan-Animate repository](https://github.com/Wan-Video/Wan2.2?tab=readme-ov-file#1-preprocessing). Integration of these preprocessing steps into Diffusers is planned for a future release.
|
||||
|
||||
The example below demonstrates how to use the Wan-Animate pipeline:
|
||||
|
||||
<hfoptions id="Animate usage">
|
||||
<hfoption id="Animation mode">
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
import torch
|
||||
from diffusers import AutoencoderKLWan, WanAnimatePipeline
|
||||
from diffusers.utils import export_to_video, load_image, load_video
|
||||
|
||||
model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
|
||||
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
||||
pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
|
||||
pipe.to("cuda")
|
||||
|
||||
# Load character image and preprocessed videos
|
||||
image = load_image("path/to/character.jpg")
|
||||
pose_video = load_video("path/to/pose_video.mp4") # Preprocessed skeletal keypoints
|
||||
face_video = load_video("path/to/face_video.mp4") # Preprocessed facial features
|
||||
|
||||
# Resize image to match VAE constraints
|
||||
def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
|
||||
aspect_ratio = image.height / image.width
|
||||
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
|
||||
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
|
||||
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
|
||||
image = image.resize((width, height))
|
||||
return image, height, width
|
||||
|
||||
image, height, width = aspect_ratio_resize(image, pipe)
|
||||
|
||||
prompt = "A person dancing energetically in a studio with dynamic lighting and professional camera work"
|
||||
negative_prompt = "blurry, low quality, distorted, deformed, static, poorly drawn"
|
||||
|
||||
# Generate animated video
|
||||
output = pipe(
|
||||
image=image,
|
||||
pose_video=pose_video,
|
||||
face_video=face_video,
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
height=height,
|
||||
width=width,
|
||||
segment_frame_length=77,
|
||||
guidance_scale=1.0,
|
||||
mode="animate", # Animation mode (default)
|
||||
).frames[0]
|
||||
export_to_video(output, "animated_character.mp4", fps=30)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Replacement mode">
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
import torch
|
||||
from diffusers import AutoencoderKLWan, WanAnimatePipeline
|
||||
from diffusers.utils import export_to_video, load_image, load_video
|
||||
|
||||
model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
|
||||
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
||||
pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
|
||||
pipe.to("cuda")
|
||||
|
||||
# Load all required inputs for replacement mode
|
||||
image = load_image("path/to/new_character.jpg")
|
||||
pose_video = load_video("path/to/pose_video.mp4") # Preprocessed skeletal keypoints
|
||||
face_video = load_video("path/to/face_video.mp4") # Preprocessed facial features
|
||||
background_video = load_video("path/to/background_video.mp4") # Original scene
|
||||
mask_video = load_video("path/to/mask_video.mp4") # Black: preserve, White: generate
|
||||
|
||||
# Resize image so its height and width fit within max_area and are multiples of the VAE scale factor times the patch size
|
||||
def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
|
||||
aspect_ratio = image.height / image.width
|
||||
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
|
||||
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
|
||||
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
|
||||
image = image.resize((width, height))
|
||||
return image, height, width
|
||||
|
||||
image, height, width = aspect_ratio_resize(image, pipe)
|
||||
|
||||
prompt = "A person seamlessly integrated into the scene with consistent lighting and environment"
|
||||
negative_prompt = "blurry, low quality, inconsistent lighting, floating, disconnected from scene"
|
||||
|
||||
# Replace character in background video
|
||||
output = pipe(
|
||||
image=image,
|
||||
pose_video=pose_video,
|
||||
face_video=face_video,
|
||||
background_video=background_video,
|
||||
mask_video=mask_video,
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
height=height,
|
||||
width=width,
|
||||
segment_frame_length=77,
|
||||
guidance_scale=1.0,
|
||||
mode="replace", # Replacement mode
|
||||
).frames[0]
|
||||
export_to_video(output, "character_replaced.mp4", fps=30)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Advanced options">
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
import torch
|
||||
from diffusers import AutoencoderKLWan, WanAnimatePipeline
|
||||
from diffusers.utils import export_to_video, load_image, load_video
|
||||
|
||||
model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
|
||||
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
||||
pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
|
||||
pipe.to("cuda")
|
||||
|
||||
image = load_image("path/to/character.jpg")
|
||||
pose_video = load_video("path/to/pose_video.mp4")
|
||||
face_video = load_video("path/to/face_video.mp4")
|
||||
|
||||
def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
|
||||
aspect_ratio = image.height / image.width
|
||||
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
|
||||
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
|
||||
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
|
||||
image = image.resize((width, height))
|
||||
return image, height, width
|
||||
|
||||
image, height, width = aspect_ratio_resize(image, pipe)
|
||||
|
||||
prompt = "A person dancing energetically in a studio"
|
||||
negative_prompt = "blurry, low quality"
|
||||
|
||||
# Advanced: Use temporal guidance and custom callback
|
||||
def callback_fn(pipe, step_index, timestep, callback_kwargs):
|
||||
# You can modify latents or other tensors here
|
||||
print(f"Step {step_index}, Timestep {timestep}")
|
||||
return callback_kwargs
|
||||
|
||||
output = pipe(
|
||||
image=image,
|
||||
pose_video=pose_video,
|
||||
face_video=face_video,
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
height=height,
|
||||
width=width,
|
||||
segment_frame_length=77,
|
||||
num_inference_steps=50,
|
||||
guidance_scale=5.0,
|
||||
prev_segment_conditioning_frames=5, # Use 5 frames for temporal guidance (1 or 5 recommended)
|
||||
callback_on_step_end=callback_fn,
|
||||
callback_on_step_end_tensor_inputs=["latents"],
|
||||
).frames[0]
|
||||
export_to_video(output, "animated_advanced.mp4", fps=30)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
#### Key Parameters
|
||||
|
||||
- **mode**: Choose between `"animate"` (default) or `"replace"`
|
||||
- **prev_segment_conditioning_frames**: Number of frames from the previous segment used for temporal guidance (1 or 5 recommended). Using 5 provides better temporal consistency but requires more memory.
|
||||
- **guidance_scale**: Controls how closely the output follows the text prompt. Higher values (5-7) produce results more aligned with the prompt. For Wan-Animate, CFG is disabled by default (`guidance_scale=1.0`) but can be enabled to support negative prompts and finer control over facial expressions. (Note that CFG will only target the text prompt and face conditioning.)
|
||||
|
||||
|
||||
## Notes
|
||||
|
||||
- Wan2.1 supports LoRAs with [`~loaders.WanLoraLoaderMixin.load_lora_weights`].
|
||||
@@ -281,10 +484,10 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip
|
||||
|
||||
# use "steamboat willie style" to trigger the LoRA
|
||||
prompt = """
|
||||
steamboat willie style, golden era animation, The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
steamboat willie style, golden era animation, The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
|
||||
"""
|
||||
|
||||
@@ -359,6 +562,12 @@ The general rule of thumb to keep in mind when preparing inputs for the VACE pip
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## WanAnimatePipeline
|
||||
|
||||
[[autodoc]] WanAnimatePipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## WanPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput
|
||||
[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput
|
||||
|
||||
@@ -0,0 +1,492 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
|
||||
# Building Custom Blocks
|
||||
|
||||
[ModularPipelineBlocks](./pipeline_block) are the fundamental building blocks of a [`ModularPipeline`]. You can create custom blocks by defining their inputs, outputs, and computation logic. This guide demonstrates how to create and use a custom block.
|
||||
|
||||
> [!TIP]
|
||||
> Explore the [Modular Diffusers Custom Blocks](https://huggingface.co/collections/diffusers/modular-diffusers-custom-blocks) collection for official custom modular blocks like Nano Banana.
|
||||
|
||||
## Project Structure
|
||||
|
||||
Your custom block project should use the following structure:
|
||||
|
||||
```shell
|
||||
.
|
||||
├── block.py
|
||||
└── modular_config.json
|
||||
```
|
||||
|
||||
- `block.py` contains the custom block implementation
|
||||
- `modular_config.json` contains the metadata needed to load the block
|
||||
|
||||
## Example: Florence 2 Inpainting Block
|
||||
|
||||
In this example we will create a custom block that uses the [Florence 2](https://huggingface.co/docs/transformers/model_doc/florence2) model to process an input image and generate a mask for inpainting.
|
||||
|
||||
The first step is to define the components that the block will use. In this case, we will need to use the `Florence2ForConditionalGeneration` model and its corresponding processor `AutoProcessor`. When defining components, we must specify the name of the component within our pipeline, the model class via `type_hint`, and a `pretrained_model_name_or_path` for the component if we intend to load the model weights from a specific repository on the Hub.
|
||||
|
||||
```py
|
||||
# Inside block.py
|
||||
from diffusers.modular_pipelines import (
|
||||
ModularPipelineBlocks,
|
||||
ComponentSpec,
|
||||
)
|
||||
from transformers import AutoProcessor, Florence2ForConditionalGeneration
|
||||
|
||||
|
||||
class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
|
||||
|
||||
@property
|
||||
def expected_components(self):
|
||||
return [
|
||||
ComponentSpec(
|
||||
name="image_annotator",
|
||||
type_hint=Florence2ForConditionalGeneration,
|
||||
pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
|
||||
),
|
||||
ComponentSpec(
|
||||
name="image_annotator_processor",
|
||||
type_hint=AutoProcessor,
|
||||
pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
|
||||
),
|
||||
]
|
||||
```
|
||||
|
||||
Next, we define the inputs and outputs of the block. The inputs include the image to be annotated, the annotation task, and the annotation prompt. The outputs include the generated mask image and annotations.
|
||||
|
||||
```py
|
||||
from typing import List, Union
|
||||
from PIL import Image, ImageDraw
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from diffusers.modular_pipelines import (
|
||||
PipelineState,
|
||||
ModularPipelineBlocks,
|
||||
InputParam,
|
||||
ComponentSpec,
|
||||
OutputParam,
|
||||
)
|
||||
from transformers import AutoProcessor, Florence2ForConditionalGeneration
|
||||
|
||||
|
||||
class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
|
||||
|
||||
@property
|
||||
def expected_components(self):
|
||||
return [
|
||||
ComponentSpec(
|
||||
name="image_annotator",
|
||||
type_hint=Florence2ForConditionalGeneration,
|
||||
pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
|
||||
),
|
||||
ComponentSpec(
|
||||
name="image_annotator_processor",
|
||||
type_hint=AutoProcessor,
|
||||
pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
|
||||
),
|
||||
]
|
||||
|
||||
@property
|
||||
def inputs(self) -> List[InputParam]:
|
||||
return [
|
||||
InputParam(
|
||||
"image",
|
||||
type_hint=Union[Image.Image, List[Image.Image]],
|
||||
required=True,
|
||||
description="Image(s) to annotate",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_task",
|
||||
type_hint=Union[str, List[str]],
|
||||
required=True,
|
||||
default="<REFERRING_EXPRESSION_SEGMENTATION>",
|
||||
description="""Annotation Task to perform on the image.
|
||||
Supported Tasks:
|
||||
|
||||
<OD>
|
||||
<REFERRING_EXPRESSION_SEGMENTATION>
|
||||
<CAPTION>
|
||||
<DETAILED_CAPTION>
|
||||
<MORE_DETAILED_CAPTION>
|
||||
<DENSE_REGION_CAPTION>
|
||||
<CAPTION_TO_PHRASE_GROUNDING>
|
||||
<OPEN_VOCABULARY_DETECTION>
|
||||
|
||||
""",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_prompt",
|
||||
type_hint=Union[str, List[str]],
|
||||
required=True,
|
||||
description="""Annotation Prompt to provide more context to the task.
|
||||
Can be used to detect or segment out specific elements in the image
|
||||
""",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_output_type",
|
||||
type_hint=str,
|
||||
required=True,
|
||||
default="mask_image",
|
||||
description="""Output type from annotation predictions. Available options are
|
||||
mask_image:
|
||||
- black and white mask image for the given image based on the task type
|
||||
mask_overlay:
|
||||
- mask overlayed on the original image
|
||||
bounding_box:
|
||||
- bounding boxes drawn on the original image
|
||||
""",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_overlay",
|
||||
type_hint=bool,
|
||||
required=True,
|
||||
default=False,
|
||||
description="",
|
||||
),
|
||||
]
|
||||
|
||||
@property
|
||||
def intermediate_outputs(self) -> List[OutputParam]:
|
||||
return [
|
||||
OutputParam(
|
||||
"mask_image",
|
||||
type_hint=Image,
|
||||
description="Inpainting Mask for input Image(s)",
|
||||
),
|
||||
OutputParam(
|
||||
"annotations",
|
||||
type_hint=dict,
|
||||
description="Annotations Predictions for input Image(s)",
|
||||
),
|
||||
OutputParam(
|
||||
"image",
|
||||
type_hint=Image,
|
||||
description="Annotated input Image(s)",
|
||||
),
|
||||
]
|
||||
|
||||
```
|
||||
|
||||
Now we implement the `__call__` method, which contains the logic for processing the input image and generating the mask.
|
||||
|
||||
```py
|
||||
from typing import List, Union
|
||||
from PIL import Image, ImageDraw
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from diffusers.modular_pipelines import (
|
||||
PipelineState,
|
||||
ModularPipelineBlocks,
|
||||
InputParam,
|
||||
ComponentSpec,
|
||||
OutputParam,
|
||||
)
|
||||
from transformers import AutoProcessor, Florence2ForConditionalGeneration
|
||||
|
||||
|
||||
class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
|
||||
|
||||
@property
|
||||
def expected_components(self):
|
||||
return [
|
||||
ComponentSpec(
|
||||
name="image_annotator",
|
||||
type_hint=Florence2ForConditionalGeneration,
|
||||
pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
|
||||
),
|
||||
ComponentSpec(
|
||||
name="image_annotator_processor",
|
||||
type_hint=AutoProcessor,
|
||||
pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
|
||||
),
|
||||
]
|
||||
|
||||
@property
|
||||
def inputs(self) -> List[InputParam]:
|
||||
return [
|
||||
InputParam(
|
||||
"image",
|
||||
type_hint=Union[Image.Image, List[Image.Image]],
|
||||
required=True,
|
||||
description="Image(s) to annotate",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_task",
|
||||
type_hint=Union[str, List[str]],
|
||||
required=True,
|
||||
default="<REFERRING_EXPRESSION_SEGMENTATION>",
|
||||
description="""Annotation Task to perform on the image.
|
||||
Supported Tasks:
|
||||
|
||||
<OD>
|
||||
<REFERRING_EXPRESSION_SEGMENTATION>
|
||||
<CAPTION>
|
||||
<DETAILED_CAPTION>
|
||||
<MORE_DETAILED_CAPTION>
|
||||
<DENSE_REGION_CAPTION>
|
||||
<CAPTION_TO_PHRASE_GROUNDING>
|
||||
<OPEN_VOCABULARY_DETECTION>
|
||||
|
||||
""",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_prompt",
|
||||
type_hint=Union[str, List[str]],
|
||||
required=True,
|
||||
description="""Annotation Prompt to provide more context to the task.
|
||||
Can be used to detect or segment out specific elements in the image
|
||||
""",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_output_type",
|
||||
type_hint=str,
|
||||
required=True,
|
||||
default="mask_image",
|
||||
description="""Output type from annotation predictions. Available options are
|
||||
mask_image:
|
||||
- black and white mask image for the given image based on the task type
|
||||
mask_overlay:
|
||||
- mask overlayed on the original image
|
||||
bounding_box:
|
||||
- bounding boxes drawn on the original image
|
||||
""",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_overlay",
|
||||
type_hint=bool,
|
||||
required=True,
|
||||
default=False,
|
||||
description="",
|
||||
),
|
||||
]
|
||||
|
||||
@property
|
||||
def intermediate_outputs(self) -> List[OutputParam]:
|
||||
return [
|
||||
OutputParam(
|
||||
"mask_image",
|
||||
type_hint=Image,
|
||||
description="Inpainting Mask for input Image(s)",
|
||||
),
|
||||
OutputParam(
|
||||
"annotations",
|
||||
type_hint=dict,
|
||||
description="Annotations Predictions for input Image(s)",
|
||||
),
|
||||
OutputParam(
|
||||
"image",
|
||||
type_hint=Image,
|
||||
description="Annotated input Image(s)",
|
||||
),
|
||||
]
|
||||
|
||||
def get_annotations(self, components, images, prompts, task):
|
||||
task_prompts = [task + prompt for prompt in prompts]
|
||||
|
||||
inputs = components.image_annotator_processor(
|
||||
text=task_prompts, images=images, return_tensors="pt"
|
||||
).to(components.image_annotator.device, components.image_annotator.dtype)
|
||||
|
||||
generated_ids = components.image_annotator.generate(
|
||||
input_ids=inputs["input_ids"],
|
||||
pixel_values=inputs["pixel_values"],
|
||||
max_new_tokens=1024,
|
||||
early_stopping=False,
|
||||
do_sample=False,
|
||||
num_beams=3,
|
||||
)
|
||||
annotations = components.image_annotator_processor.batch_decode(
|
||||
generated_ids, skip_special_tokens=False
|
||||
)
|
||||
outputs = []
|
||||
for image, annotation in zip(images, annotations):
|
||||
outputs.append(
|
||||
components.image_annotator_processor.post_process_generation(
|
||||
annotation, task=task, image_size=(image.width, image.height)
|
||||
)
|
||||
)
|
||||
return outputs
|
||||
|
||||
def prepare_mask(self, images, annotations, overlay=False, fill="white"):
|
||||
masks = []
|
||||
for image, annotation in zip(images, annotations):
|
||||
mask_image = image.copy() if overlay else Image.new("L", image.size, 0)
|
||||
draw = ImageDraw.Draw(mask_image)
|
||||
|
||||
for _, _annotation in annotation.items():
|
||||
if "polygons" in _annotation:
|
||||
for polygon in _annotation["polygons"]:
|
||||
polygon = np.array(polygon).reshape(-1, 2)
|
||||
if len(polygon) < 3:
|
||||
continue
|
||||
polygon = polygon.reshape(-1).tolist()
|
||||
draw.polygon(polygon, fill=fill)
|
||||
|
||||
elif "bbox" in _annotation:
|
||||
bbox = _annotation["bbox"]
|
||||
draw.rectangle(bbox, fill="white")
|
||||
|
||||
masks.append(mask_image)
|
||||
|
||||
return masks
|
||||
|
||||
def prepare_bounding_boxes(self, images, annotations):
|
||||
outputs = []
|
||||
for image, annotation in zip(images, annotations):
|
||||
image_copy = image.copy()
|
||||
draw = ImageDraw.Draw(image_copy)
|
||||
for _, _annotation in annotation.items():
|
||||
bbox = _annotation["bbox"]
|
||||
label = _annotation["label"]
|
||||
|
||||
draw.rectangle(bbox, outline="red", width=3)
|
||||
draw.text((bbox[0], bbox[1] - 20), label, fill="red")
|
||||
|
||||
outputs.append(image_copy)
|
||||
|
||||
return outputs
|
||||
|
||||
def prepare_inputs(self, images, prompts):
|
||||
prompts = prompts or ""
|
||||
|
||||
if isinstance(images, Image.Image):
|
||||
images = [images]
|
||||
if isinstance(prompts, str):
|
||||
prompts = [prompts]
|
||||
|
||||
if len(images) != len(prompts):
|
||||
raise ValueError("Number of images and annotation prompts must match.")
|
||||
|
||||
return images, prompts
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(self, components, state: PipelineState) -> PipelineState:
|
||||
block_state = self.get_block_state(state)
|
||||
images, annotation_task_prompt = self.prepare_inputs(
|
||||
block_state.image, block_state.annotation_prompt
|
||||
)
|
||||
task = block_state.annotation_task
|
||||
fill = block_state.fill
|
||||
|
||||
annotations = self.get_annotations(
|
||||
components, images, annotation_task_prompt, task
|
||||
)
|
||||
block_state.annotations = annotations
|
||||
if block_state.annotation_output_type == "mask_image":
|
||||
block_state.mask_image = self.prepare_mask(images, annotations)
|
||||
else:
|
||||
block_state.mask_image = None
|
||||
|
||||
if block_state.annotation_output_type == "mask_overlay":
|
||||
block_state.image = self.prepare_mask(images, annotations, overlay=True, fill=fill)
|
||||
|
||||
elif block_state.annotation_output_type == "bounding_box":
|
||||
block_state.image = self.prepare_bounding_boxes(images, annotations)
|
||||
|
||||
self.set_block_state(state, block_state)
|
||||
|
||||
return components, state
|
||||
|
||||
```
|
||||
|
||||
Once we have defined our custom block, we can save it to the Hub, using either the CLI or the [`push_to_hub`] method. This will make it easy to share and reuse our custom block with other pipelines.
|
||||
|
||||
<hfoptions id="share">
|
||||
<hfoption id="hf CLI">
|
||||
|
||||
```shell
|
||||
# In the folder with the `block.py` file, run:
|
||||
diffusers-cli custom_block
|
||||
```
|
||||
|
||||
Then upload the block to the Hub:
|
||||
|
||||
```shell
|
||||
hf upload <your repo id> . .
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="push_to_hub">
|
||||
|
||||
```py
|
||||
from block import Florence2ImageAnnotatorBlock
|
||||
block = Florence2ImageAnnotatorBlock()
|
||||
block.push_to_hub("<your repo id>")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Using Custom Blocks
|
||||
|
||||
Load the custom block with [`~ModularPipelineBlocks.from_pretrained`] and set `trust_remote_code=True`.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers.modular_pipelines import ModularPipelineBlocks, SequentialPipelineBlocks
|
||||
from diffusers.modular_pipelines.stable_diffusion_xl import INPAINT_BLOCKS
|
||||
from diffusers.utils import load_image
|
||||
|
||||
# Fetch the Florence2 image annotator block that will create our mask
|
||||
image_annotator_block = ModularPipelineBlocks.from_pretrained("diffusers/florence-2-custom-block", trust_remote_code=True)
|
||||
|
||||
my_blocks = INPAINT_BLOCKS.copy()
|
||||
# insert the annotation block before the image encoding step
|
||||
my_blocks.insert("image_annotator", image_annotator_block, 1)
|
||||
|
||||
# Create our initial set of inpainting blocks
|
||||
blocks = SequentialPipelineBlocks.from_blocks_dict(my_blocks)
|
||||
|
||||
repo_id = "diffusers/modular-stable-diffusion-xl-base-1.0"
|
||||
pipe = blocks.init_pipeline(repo_id)
|
||||
pipe.load_components(torch_dtype=torch.float16, device_map="cuda", trust_remote_code=True)
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true")
|
||||
image = image.resize((1024, 1024))
|
||||
|
||||
prompt = ["A red car"]
|
||||
annotation_task = "<REFERRING_EXPRESSION_SEGMENTATION>"
|
||||
annotation_prompt = ["the car"]
|
||||
|
||||
output = pipe(
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
annotation_task=annotation_task,
|
||||
annotation_prompt=annotation_prompt,
|
||||
annotation_output_type="mask_image",
|
||||
num_inference_steps=35,
|
||||
guidance_scale=7.5,
|
||||
strength=0.95,
|
||||
output="images"
|
||||
)
|
||||
output[0].save("florence-inpainting.png")
|
||||
```
|
||||
|
||||
## Editing Custom Blocks
|
||||
|
||||
By default, custom blocks are saved in your cache directory. Use the `local_dir` argument to download and edit a custom block in a specific folder.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers.modular_pipelines import ModularPipelineBlocks, SequentialPipelineBlocks
|
||||
from diffusers.modular_pipelines.stable_diffusion_xl import INPAINT_BLOCKS
|
||||
from diffusers.utils import load_image
|
||||
|
||||
# Fetch the Florence2 image annotator block that will create our mask
|
||||
image_annotator_block = ModularPipelineBlocks.from_pretrained("diffusers/florence-2-custom-block", trust_remote_code=True, local_dir="/my-local-folder")
|
||||
```
|
||||
|
||||
Any changes made to the block files in this folder will be reflected when you load the block again.
|
||||
@@ -0,0 +1,46 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# AutoModel
|
||||
|
||||
The [`AutoModel`] class automatically detects and loads the correct model class (UNet, transformer, VAE) from a `config.json` file. You don't need to know the specific model class name ahead of time. It supports setting the data type (`torch_dtype`) and device placement (`device_map`), and works across model types and libraries.
|
||||
|
||||
The example below loads a transformer from Diffusers and a text encoder from Transformers. Use the `subfolder` parameter to specify where to load the `config.json` file from.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoModel, DiffusionPipeline
|
||||
|
||||
transformer = AutoModel.from_pretrained(
|
||||
"Qwen/Qwen-Image", subfolder="transformer", torch_dtype=torch.bfloat16, device_map="cuda"
|
||||
)
|
||||
|
||||
text_encoder = AutoModel.from_pretrained(
|
||||
"Qwen/Qwen-Image", subfolder="text_encoder", torch_dtype=torch.bfloat16, device_map="cuda"
|
||||
)
|
||||
```
|
||||
|
||||
[`AutoModel`] also loads models from the [Hub](https://huggingface.co/models) that aren't included in Diffusers. Set `trust_remote_code=True` in [`AutoModel.from_pretrained`] to load custom models.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoModel
|
||||
|
||||
transformer = AutoModel.from_pretrained(
|
||||
"custom/custom-transformer-model", trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="cuda"
|
||||
)
|
||||
```
|
||||
|
||||
If the custom model inherits from the [`ModelMixin`] class, it gets access to the same features as Diffusers model classes, like [regional compilation](../optimization/fp16#regional-compilation) and [group offloading](../optimization/memory#group-offloading).
|
||||
|
||||
> [!NOTE]
|
||||
> Learn more about implementing custom models in the [Community components](../using-diffusers/custom_pipeline_overview#community-components) guide.
|
||||
@@ -5488,7 +5488,7 @@ Editing at Scale", many thanks to their contribution!
|
||||
|
||||
This implementation of Flux Kontext allows users to pass multiple reference images. Each image is encoded separately, and the resulting latent vectors are concatenated.
|
||||
|
||||
As explained in Section 3 of [the paper](https://arxiv.org/pdf/2506.15742), the model's sequence concatenation mechanism can extend its capabilities to handle multiple reference images. However, note that the current version of Flux Kontext was not trained for this use case. In practice, stacking along the first axis does not yield correct results, while stacking along the other two axes appears to work.
|
||||
As explained in Section 3 of [the paper](https://huggingface.co/papers/2506.15742), the model's sequence concatenation mechanism can extend its capabilities to handle multiple reference images. However, note that the current version of Flux Kontext was not trained for this use case. In practice, stacking along the first axis does not yield correct results, while stacking along the other two axes appears to work.
|
||||
|
||||
## Example Usage
|
||||
|
||||
|
||||
@@ -490,7 +490,7 @@ class RegionalPromptingStableDiffusionPipeline(
|
||||
def prepare_extra_step_kwargs(self, generator, eta):
|
||||
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
||||
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
||||
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
||||
# eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
|
||||
# and should be between [0, 1]
|
||||
|
||||
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
||||
@@ -841,7 +841,7 @@ class RegionalPromptingStableDiffusionPipeline(
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
|
||||
Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies
|
||||
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
|
||||
@@ -872,7 +872,7 @@ class RegionalPromptingStableDiffusionPipeline(
|
||||
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
||||
Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
|
||||
Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
|
||||
using zero terminal SNR.
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
@@ -1062,7 +1062,7 @@ class RegionalPromptingStableDiffusionPipeline(
|
||||
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
|
||||
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
|
||||
# Based on 3.4. in https://huggingface.co/papers/2305.08891
|
||||
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
@@ -1668,7 +1668,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
||||
r"""
|
||||
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
|
||||
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
||||
Flawed](https://huggingface.co/papers/2305.08891).
|
||||
|
||||
Args:
|
||||
noise_cfg (`torch.Tensor`):
|
||||
|
||||
@@ -268,12 +268,11 @@ provide a simple script for LoRA fine-tuning Kontext in [train_dreambooth_lora_f
|
||||
**important**
|
||||
|
||||
> [!NOTE]
|
||||
> To make sure you can successfully run the latest version of the kontext example script, we highly recommend installing from source, specifically from the commit mentioned below.
|
||||
> To make sure you can successfully run the latest version of the kontext example script, we highly recommend installing from source.
|
||||
> To do this, execute the following steps in a new virtual environment:
|
||||
> ```
|
||||
> git clone https://github.com/huggingface/diffusers
|
||||
> cd diffusers
|
||||
> git checkout 05e7a854d0a5661f5b433f6dd5954c224b104f0b
|
||||
> pip install -e .
|
||||
> ```
|
||||
|
||||
|
||||
@@ -6,11 +6,20 @@ import torch
|
||||
from accelerate import init_empty_weights
|
||||
from huggingface_hub import hf_hub_download, snapshot_download
|
||||
from safetensors.torch import load_file
|
||||
from transformers import AutoProcessor, AutoTokenizer, CLIPVisionModelWithProjection, UMT5EncoderModel
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
AutoTokenizer,
|
||||
CLIPImageProcessor,
|
||||
CLIPVisionModel,
|
||||
CLIPVisionModelWithProjection,
|
||||
UMT5EncoderModel,
|
||||
)
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKLWan,
|
||||
UniPCMultistepScheduler,
|
||||
WanAnimatePipeline,
|
||||
WanAnimateTransformer3DModel,
|
||||
WanImageToVideoPipeline,
|
||||
WanPipeline,
|
||||
WanTransformer3DModel,
|
||||
@@ -105,8 +114,203 @@ VACE_TRANSFORMER_KEYS_RENAME_DICT = {
|
||||
"after_proj": "proj_out",
|
||||
}
|
||||
|
||||
ANIMATE_TRANSFORMER_KEYS_RENAME_DICT = {
|
||||
"time_embedding.0": "condition_embedder.time_embedder.linear_1",
|
||||
"time_embedding.2": "condition_embedder.time_embedder.linear_2",
|
||||
"text_embedding.0": "condition_embedder.text_embedder.linear_1",
|
||||
"text_embedding.2": "condition_embedder.text_embedder.linear_2",
|
||||
"time_projection.1": "condition_embedder.time_proj",
|
||||
"head.modulation": "scale_shift_table",
|
||||
"head.head": "proj_out",
|
||||
"modulation": "scale_shift_table",
|
||||
"ffn.0": "ffn.net.0.proj",
|
||||
"ffn.2": "ffn.net.2",
|
||||
# Hack to swap the layer names
|
||||
# The original model calls the norms in following order: norm1, norm3, norm2
|
||||
# We convert it to: norm1, norm2, norm3
|
||||
"norm2": "norm__placeholder",
|
||||
"norm3": "norm2",
|
||||
"norm__placeholder": "norm3",
|
||||
"img_emb.proj.0": "condition_embedder.image_embedder.norm1",
|
||||
"img_emb.proj.1": "condition_embedder.image_embedder.ff.net.0.proj",
|
||||
"img_emb.proj.3": "condition_embedder.image_embedder.ff.net.2",
|
||||
"img_emb.proj.4": "condition_embedder.image_embedder.norm2",
|
||||
# Add attention component mappings
|
||||
"self_attn.q": "attn1.to_q",
|
||||
"self_attn.k": "attn1.to_k",
|
||||
"self_attn.v": "attn1.to_v",
|
||||
"self_attn.o": "attn1.to_out.0",
|
||||
"self_attn.norm_q": "attn1.norm_q",
|
||||
"self_attn.norm_k": "attn1.norm_k",
|
||||
"cross_attn.q": "attn2.to_q",
|
||||
"cross_attn.k": "attn2.to_k",
|
||||
"cross_attn.v": "attn2.to_v",
|
||||
"cross_attn.o": "attn2.to_out.0",
|
||||
"cross_attn.norm_q": "attn2.norm_q",
|
||||
"cross_attn.norm_k": "attn2.norm_k",
|
||||
"cross_attn.k_img": "attn2.to_k_img",
|
||||
"cross_attn.v_img": "attn2.to_v_img",
|
||||
"cross_attn.norm_k_img": "attn2.norm_k_img",
|
||||
# After cross_attn -> attn2 rename, we need to rename the img keys
|
||||
"attn2.to_k_img": "attn2.add_k_proj",
|
||||
"attn2.to_v_img": "attn2.add_v_proj",
|
||||
"attn2.norm_k_img": "attn2.norm_added_k",
|
||||
# Wan Animate-specific mappings (motion encoder, face encoder, face adapter)
|
||||
# Motion encoder mappings
|
||||
# The name mapping is complicated for the convolutional part so we handle that in its own function
|
||||
"motion_encoder.enc.fc": "motion_encoder.motion_network",
|
||||
"motion_encoder.dec.direction.weight": "motion_encoder.motion_synthesis_weight",
|
||||
# Face encoder mappings - CausalConv1d has a .conv submodule that we need to flatten
|
||||
"face_encoder.conv1_local.conv": "face_encoder.conv1_local",
|
||||
"face_encoder.conv2.conv": "face_encoder.conv2",
|
||||
"face_encoder.conv3.conv": "face_encoder.conv3",
|
||||
# Face adapter mappings are handled in a separate function
|
||||
}
|
||||
|
||||
|
||||
# TODO: Verify this and simplify if possible.
|
||||
def convert_animate_motion_encoder_weights(key: str, state_dict: Dict[str, Any], final_conv_idx: int = 8) -> None:
    """
    Convert a single motion encoder entry of the Animate checkpoint, in place.

    In the original model:
    - All Linear layers in fc use EqualLinear
    - All Conv2d layers in convs use EqualConv2d (except blur_conv which is initialized separately)
    - Blur kernels are stored as buffers in Sequential modules
    - ConvLayer is nn.Sequential with indices: [Blur (optional), EqualConv2d, FusedLeakyReLU (optional)]

    Conversion strategy:
    1. Drop .kernel buffers (blur kernels)
    2. Rename sequential indices to named components, e.g.
        - enc.net_app.convs.0.0.weight -> conv_in.weight
        - enc.net_app.convs.0.1.bias -> conv_in.act_fn.bias
        - enc.net_app.convs.{n}.conv1.0.weight -> res_blocks.{n-1}.conv1.weight
        - enc.net_app.convs.{n}.conv1.1.bias -> res_blocks.{n-1}.conv1.act_fn.bias
        - enc.net_app.convs.{n}.conv2.1.weight -> res_blocks.{n-1}.conv2.weight
        - enc.net_app.convs.{n}.skip.1.weight -> res_blocks.{n-1}.conv_skip.weight
        - enc.net_app.convs.{final_conv_idx}.weight -> conv_out.weight

    Args:
        key: Name of the state dict entry to convert.
        state_dict: Checkpoint mapping; it is modified in place (entries are renamed or removed).
        final_conv_idx: Index under `convs` of the last convolution, which maps to `conv_out`.

    Entries that have no known destination (for example a `.bias` on the final convolution, or a non-numeric
    component after `convs`) are left untouched instead of failing with an unbound local variable, so that a later
    strict `load_state_dict` can report them by name.
    """
    # Skip if not a weight, bias, or kernel
    if ".weight" not in key and ".bias" not in key and ".kernel" not in key:
        return

    # Handle Blur kernel buffers from the original implementation.
    # Diffusers constructs blur kernels as a non-persistent buffer, so these keys must be dropped
    # to avoid strict load errors.
    if ".kernel" in key and "motion_encoder" in key:
        state_dict.pop(key, None)
        return

    # Only the convolutional part (ConvLayer / ResBlock Sequentials) needs index-based renaming.
    if ".enc.net_app.convs." not in key or (".weight" not in key and ".bias" not in key):
        return

    is_weight = key.endswith(".weight")
    is_bias = key.endswith(".bias")
    if not (is_weight or is_bias):
        # ".weight"/".bias" appears mid-key only; there is no mapping for such an entry.
        return

    parts = key.split(".")
    convs_idx = parts.index("convs") if "convs" in parts else -1
    if convs_idx < 0 or len(parts) - convs_idx < 2:
        return

    # The nn.Sequential index always follows `convs`; anything else is not a layout we know how to map.
    index_token = parts[convs_idx + 1]
    if not index_token.isdigit():
        return
    sequential_idx = int(index_token)

    # Resolve the destination name BEFORE touching state_dict so that unmapped keys are never popped.
    if sequential_idx == 0:
        new_key = "motion_encoder.conv_in.weight" if is_weight else "motion_encoder.conv_in.act_fn.bias"
    elif sequential_idx == final_conv_idx:
        if not is_weight:
            # No destination is defined for a bias on the final conv layer.
            return
        new_key = "motion_encoder.conv_out.weight"
    else:
        # Intermediate .convs. layers, which get mapped to .res_blocks. (shifted down by one because
        # index 0 is conv_in).
        layer_name = parts[convs_idx + 2]
        if layer_name == "skip":
            layer_name = "conv_skip"
        param_name = "weight" if is_weight else "act_fn.bias"
        new_key = "motion_encoder.res_blocks." + ".".join([str(sequential_idx - 1), layer_name, param_name])

    param = state_dict.pop(key)
    if is_bias:
        # Biases are squeezed to drop their singleton dimensions.
        param = param.squeeze()
    state_dict[new_key] = param
|
||||
|
||||
|
||||
def convert_animate_face_adapter_weights(key: str, state_dict: Dict[str, Any]) -> None:
    """
    Convert a single face adapter entry of the Animate checkpoint, in place.

    The original model uses a fused KV projection but the diffusers model uses separate K and V projections, so
    `linear1_kv` parameters are split in half along dim 0 into `to_k` and `to_v`. The remaining known layers are
    renamed:
        - q_norm -> norm_q
        - k_norm -> norm_k
        - linear1_q -> to_q
        - linear2 -> to_out

    Only keys of the form `<...>.fuser_blocks.<block_idx>.<layer_name>.<param_name>` are handled; they become
    `face_adapter.<block_idx>.<new_layer_name>.<param_name>`.

    Args:
        key: Name of the state dict entry to convert.
        state_dict: Checkpoint mapping; it is modified in place.

    Entries whose layer name is not one of the above are left untouched rather than failing with an unbound local
    variable, so that a later strict `load_state_dict` can report them by name.
    """
    # Skip if not a weight or bias
    if ".weight" not in key and ".bias" not in key:
        return
    if ".fuser_blocks." not in key:
        return

    parts = key.split(".")
    module_list_idx = parts.index("fuser_blocks") if "fuser_blocks" in parts else -1
    # Require exactly three components after `fuser_blocks`: block index, layer name, parameter name.
    if module_list_idx < 0 or (len(parts) - 1) - module_list_idx != 3:
        return

    block_idx, layer_name, param_name = parts[module_list_idx + 1 : module_list_idx + 4]
    prefix = "face_adapter."

    if layer_name == "linear1_kv":
        # Fused KV projection: first half along dim 0 is K, second half is V.
        kv_proj = state_dict.pop(key)
        k_proj, v_proj = torch.chunk(kv_proj, 2, dim=0)
        state_dict[prefix + ".".join([block_idx, "to_k", param_name])] = k_proj
        state_dict[prefix + ".".join([block_idx, "to_v", param_name])] = v_proj
        return

    layer_renames = {
        "q_norm": "norm_q",
        "k_norm": "norm_k",
        "linear1_q": "to_q",
        "linear2": "to_out",
    }
    new_layer_name = layer_renames.get(layer_name)
    if new_layer_name is None:
        # Unknown layer: no mapping is defined, so do not guess a destination.
        return

    new_key = prefix + ".".join([block_idx, new_layer_name, param_name])
    state_dict[new_key] = state_dict.pop(key)
|
||||
|
||||
|
||||
TRANSFORMER_SPECIAL_KEYS_REMAP = {}
|
||||
VACE_TRANSFORMER_SPECIAL_KEYS_REMAP = {}
|
||||
ANIMATE_TRANSFORMER_SPECIAL_KEYS_REMAP = {
|
||||
"motion_encoder": convert_animate_motion_encoder_weights,
|
||||
"face_adapter": convert_animate_face_adapter_weights,
|
||||
}
|
||||
|
||||
|
||||
def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
|
||||
@@ -364,6 +568,37 @@ def get_transformer_config(model_type: str) -> Tuple[Dict[str, Any], ...]:
|
||||
}
|
||||
RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
|
||||
SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
|
||||
elif model_type == "Wan2.2-Animate-14B":
|
||||
config = {
|
||||
"model_id": "Wan-AI/Wan2.2-Animate-14B",
|
||||
"diffusers_config": {
|
||||
"image_dim": 1280,
|
||||
"added_kv_proj_dim": 5120,
|
||||
"attention_head_dim": 128,
|
||||
"cross_attn_norm": True,
|
||||
"eps": 1e-06,
|
||||
"ffn_dim": 13824,
|
||||
"freq_dim": 256,
|
||||
"in_channels": 36,
|
||||
"num_attention_heads": 40,
|
||||
"num_layers": 40,
|
||||
"out_channels": 16,
|
||||
"patch_size": (1, 2, 2),
|
||||
"qk_norm": "rms_norm_across_heads",
|
||||
"text_dim": 4096,
|
||||
"rope_max_seq_len": 1024,
|
||||
"pos_embed_seq_len": None,
|
||||
"motion_encoder_size": 512, # Start of Wan Animate-specific configs
|
||||
"motion_style_dim": 512,
|
||||
"motion_dim": 20,
|
||||
"motion_encoder_dim": 512,
|
||||
"face_encoder_hidden_dim": 1024,
|
||||
"face_encoder_num_heads": 4,
|
||||
"inject_face_latents_blocks": 5,
|
||||
},
|
||||
}
|
||||
RENAME_DICT = ANIMATE_TRANSFORMER_KEYS_RENAME_DICT
|
||||
SPECIAL_KEYS_REMAP = ANIMATE_TRANSFORMER_SPECIAL_KEYS_REMAP
|
||||
return config, RENAME_DICT, SPECIAL_KEYS_REMAP
|
||||
|
||||
|
||||
@@ -380,10 +615,12 @@ def convert_transformer(model_type: str, stage: str = None):
|
||||
original_state_dict = load_sharded_safetensors(model_dir)
|
||||
|
||||
with init_empty_weights():
|
||||
if "VACE" not in model_type:
|
||||
transformer = WanTransformer3DModel.from_config(diffusers_config)
|
||||
else:
|
||||
if "Animate" in model_type:
|
||||
transformer = WanAnimateTransformer3DModel.from_config(diffusers_config)
|
||||
elif "VACE" in model_type:
|
||||
transformer = WanVACETransformer3DModel.from_config(diffusers_config)
|
||||
else:
|
||||
transformer = WanTransformer3DModel.from_config(diffusers_config)
|
||||
|
||||
for key in list(original_state_dict.keys()):
|
||||
new_key = key[:]
|
||||
@@ -397,7 +634,12 @@ def convert_transformer(model_type: str, stage: str = None):
|
||||
continue
|
||||
handler_fn_inplace(key, original_state_dict)
|
||||
|
||||
# Load state dict into the meta model, which will materialize the tensors
|
||||
transformer.load_state_dict(original_state_dict, strict=True, assign=True)
|
||||
|
||||
# Move to CPU to ensure all tensors are materialized
|
||||
transformer = transformer.to("cpu")
|
||||
|
||||
return transformer
|
||||
|
||||
|
||||
@@ -926,7 +1168,7 @@ DTYPE_MAPPING = {
|
||||
if __name__ == "__main__":
|
||||
args = get_args()
|
||||
|
||||
if "Wan2.2" in args.model_type and "TI2V" not in args.model_type:
|
||||
if "Wan2.2" in args.model_type and "TI2V" not in args.model_type and "Animate" not in args.model_type:
|
||||
transformer = convert_transformer(args.model_type, stage="high_noise_model")
|
||||
transformer_2 = convert_transformer(args.model_type, stage="low_noise_model")
|
||||
else:
|
||||
@@ -942,7 +1184,7 @@ if __name__ == "__main__":
|
||||
tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
|
||||
if "FLF2V" in args.model_type:
|
||||
flow_shift = 16.0
|
||||
elif "TI2V" in args.model_type:
|
||||
elif "TI2V" in args.model_type or "Animate" in args.model_type:
|
||||
flow_shift = 5.0
|
||||
else:
|
||||
flow_shift = 3.0
|
||||
@@ -954,6 +1196,8 @@ if __name__ == "__main__":
|
||||
if args.dtype != "none":
|
||||
dtype = DTYPE_MAPPING[args.dtype]
|
||||
transformer.to(dtype)
|
||||
if transformer_2 is not None:
|
||||
transformer_2.to(dtype)
|
||||
|
||||
if "Wan2.2" and "I2V" in args.model_type and "TI2V" not in args.model_type:
|
||||
pipe = WanImageToVideoPipeline(
|
||||
@@ -1016,6 +1260,21 @@ if __name__ == "__main__":
|
||||
vae=vae,
|
||||
scheduler=scheduler,
|
||||
)
|
||||
elif "Animate" in args.model_type:
|
||||
image_encoder = CLIPVisionModel.from_pretrained(
|
||||
"laion/CLIP-ViT-H-14-laion2B-s32B-b79K", torch_dtype=torch.bfloat16
|
||||
)
|
||||
image_processor = CLIPImageProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
|
||||
|
||||
pipe = WanAnimatePipeline(
|
||||
transformer=transformer,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
vae=vae,
|
||||
scheduler=scheduler,
|
||||
image_encoder=image_encoder,
|
||||
image_processor=image_processor,
|
||||
)
|
||||
else:
|
||||
pipe = WanPipeline(
|
||||
transformer=transformer,
|
||||
|
||||
@@ -268,6 +268,7 @@ else:
|
||||
"UNetSpatioTemporalConditionModel",
|
||||
"UVit2DModel",
|
||||
"VQModel",
|
||||
"WanAnimateTransformer3DModel",
|
||||
"WanTransformer3DModel",
|
||||
"WanVACETransformer3DModel",
|
||||
"attention_backend",
|
||||
@@ -636,6 +637,7 @@ else:
|
||||
"VisualClozeGenerationPipeline",
|
||||
"VisualClozePipeline",
|
||||
"VQDiffusionPipeline",
|
||||
"WanAnimatePipeline",
|
||||
"WanImageToVideoPipeline",
|
||||
"WanPipeline",
|
||||
"WanVACEPipeline",
|
||||
@@ -977,6 +979,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
UNetSpatioTemporalConditionModel,
|
||||
UVit2DModel,
|
||||
VQModel,
|
||||
WanAnimateTransformer3DModel,
|
||||
WanTransformer3DModel,
|
||||
WanVACETransformer3DModel,
|
||||
attention_backend,
|
||||
@@ -1315,6 +1318,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
VisualClozeGenerationPipeline,
|
||||
VisualClozePipeline,
|
||||
VQDiffusionPipeline,
|
||||
WanAnimatePipeline,
|
||||
WanImageToVideoPipeline,
|
||||
WanPipeline,
|
||||
WanVACEPipeline,
|
||||
|
||||
@@ -373,7 +373,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
||||
r"""
|
||||
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
|
||||
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
||||
Flawed](https://huggingface.co/papers/2305.08891).
|
||||
|
||||
Args:
|
||||
noise_cfg (`torch.Tensor`):
|
||||
|
||||
@@ -409,7 +409,7 @@ class VaeImageProcessor(ConfigMixin):
|
||||
src_w = width if ratio < src_ratio else image.width * height // image.height
|
||||
src_h = height if ratio >= src_ratio else image.height * width // image.width
|
||||
|
||||
resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION["lanczos"])
|
||||
resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION[self.config.resample])
|
||||
res = Image.new("RGB", (width, height))
|
||||
res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
|
||||
|
||||
@@ -460,7 +460,7 @@ class VaeImageProcessor(ConfigMixin):
|
||||
src_w = width if ratio > src_ratio else image.width * height // image.height
|
||||
src_h = height if ratio <= src_ratio else image.height * width // image.width
|
||||
|
||||
resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION["lanczos"])
|
||||
resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION[self.config.resample])
|
||||
res = Image.new("RGB", (width, height))
|
||||
res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
|
||||
return res
|
||||
|
||||
@@ -108,6 +108,7 @@ if is_torch_available():
|
||||
_import_structure["transformers.transformer_skyreels_v2"] = ["SkyReelsV2Transformer3DModel"]
|
||||
_import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
|
||||
_import_structure["transformers.transformer_wan"] = ["WanTransformer3DModel"]
|
||||
_import_structure["transformers.transformer_wan_animate"] = ["WanAnimateTransformer3DModel"]
|
||||
_import_structure["transformers.transformer_wan_vace"] = ["WanVACETransformer3DModel"]
|
||||
_import_structure["unets.unet_1d"] = ["UNet1DModel"]
|
||||
_import_structure["unets.unet_2d"] = ["UNet2DModel"]
|
||||
@@ -214,6 +215,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
T5FilmDecoder,
|
||||
Transformer2DModel,
|
||||
TransformerTemporalModel,
|
||||
WanAnimateTransformer3DModel,
|
||||
WanTransformer3DModel,
|
||||
WanVACETransformer3DModel,
|
||||
)
|
||||
|
||||
@@ -383,12 +383,18 @@ def _check_shape(
|
||||
attn_mask: Optional[torch.Tensor] = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
# Expected shapes:
|
||||
# query: (batch_size, seq_len_q, num_heads, head_dim)
|
||||
# key: (batch_size, seq_len_kv, num_heads, head_dim)
|
||||
# value: (batch_size, seq_len_kv, num_heads, head_dim)
|
||||
# attn_mask: (seq_len_q, seq_len_kv) or (batch_size, seq_len_q, seq_len_kv)
|
||||
# or (batch_size, num_heads, seq_len_q, seq_len_kv)
|
||||
if query.shape[-1] != key.shape[-1]:
|
||||
raise ValueError("Query and key must have the same last dimension.")
|
||||
if query.shape[-2] != value.shape[-2]:
|
||||
raise ValueError("Query and value must have the same second to last dimension.")
|
||||
if attn_mask is not None and attn_mask.shape[-1] != key.shape[-2]:
|
||||
raise ValueError("Attention mask must match the key's second to last dimension.")
|
||||
raise ValueError("Query and key must have the same head dimension.")
|
||||
if key.shape[-3] != value.shape[-3]:
|
||||
raise ValueError("Key and value must have the same sequence length.")
|
||||
if attn_mask is not None and attn_mask.shape[-1] != key.shape[-3]:
|
||||
raise ValueError("Attention mask must match the key's sequence length.")
|
||||
|
||||
|
||||
# ===== Helper functions =====
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# QwenImageVAE is further fine-tuned from the Wan Video VAE to achieve improved performance.
|
||||
# For more information about the Wan VAE, please refer to:
|
||||
# - GitHub: https://github.com/Wan-Video/Wan2.1
|
||||
# - arXiv: https://arxiv.org/abs/2503.20314
|
||||
# - Paper: https://huggingface.co/papers/2503.20314
|
||||
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
|
||||
@@ -42,4 +42,5 @@ if is_torch_available():
|
||||
from .transformer_skyreels_v2 import SkyReelsV2Transformer3DModel
|
||||
from .transformer_temporal import TransformerTemporalModel
|
||||
from .transformer_wan import WanTransformer3DModel
|
||||
from .transformer_wan_animate import WanAnimateTransformer3DModel
|
||||
from .transformer_wan_vace import WanVACETransformer3DModel
|
||||
|
||||
@@ -275,7 +275,12 @@ class PRXEmbedND(nn.Module):
|
||||
|
||||
def rope(self, pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
|
||||
assert dim % 2 == 0
|
||||
scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
|
||||
|
||||
is_mps = pos.device.type == "mps"
|
||||
is_npu = pos.device.type == "npu"
|
||||
dtype = torch.float32 if (is_mps or is_npu) else torch.float64
|
||||
|
||||
scale = torch.arange(0, dim, 2, dtype=dtype, device=pos.device) / dim
|
||||
omega = 1.0 / (theta**scale)
|
||||
out = pos.unsqueeze(-1) * omega.unsqueeze(0)
|
||||
out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
|
||||
|
||||
@@ -172,7 +172,6 @@ class SanaLinearAttnProcessor3_0:
|
||||
return hidden_states
|
||||
|
||||
|
||||
# Copied from diffusers.models.transformers.transformer_wan.WanRotaryPosEmbed
|
||||
class WanRotaryPosEmbed(nn.Module):
|
||||
def __init__(
|
||||
self,
|
||||
@@ -189,6 +188,11 @@ class WanRotaryPosEmbed(nn.Module):
|
||||
|
||||
h_dim = w_dim = 2 * (attention_head_dim // 6)
|
||||
t_dim = attention_head_dim - h_dim - w_dim
|
||||
|
||||
self.t_dim = t_dim
|
||||
self.h_dim = h_dim
|
||||
self.w_dim = w_dim
|
||||
|
||||
freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
|
||||
|
||||
freqs_cos = []
|
||||
@@ -214,11 +218,7 @@ class WanRotaryPosEmbed(nn.Module):
|
||||
p_t, p_h, p_w = self.patch_size
|
||||
ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
|
||||
|
||||
split_sizes = [
|
||||
self.attention_head_dim - 2 * (self.attention_head_dim // 3),
|
||||
self.attention_head_dim // 3,
|
||||
self.attention_head_dim // 3,
|
||||
]
|
||||
split_sizes = [self.t_dim, self.h_dim, self.w_dim]
|
||||
|
||||
freqs_cos = self.freqs_cos.split(split_sizes, dim=1)
|
||||
freqs_sin = self.freqs_sin.split(split_sizes, dim=1)
|
||||
|
||||
@@ -389,6 +389,10 @@ class SkyReelsV2RotaryPosEmbed(nn.Module):
|
||||
t_dim = attention_head_dim - h_dim - w_dim
|
||||
freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
|
||||
|
||||
self.t_dim = t_dim
|
||||
self.h_dim = h_dim
|
||||
self.w_dim = w_dim
|
||||
|
||||
freqs_cos = []
|
||||
freqs_sin = []
|
||||
|
||||
@@ -412,11 +416,7 @@ class SkyReelsV2RotaryPosEmbed(nn.Module):
|
||||
p_t, p_h, p_w = self.patch_size
|
||||
ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
|
||||
|
||||
split_sizes = [
|
||||
self.attention_head_dim - 2 * (self.attention_head_dim // 3),
|
||||
self.attention_head_dim // 3,
|
||||
self.attention_head_dim // 3,
|
||||
]
|
||||
split_sizes = [self.t_dim, self.h_dim, self.w_dim]
|
||||
|
||||
freqs_cos = self.freqs_cos.split(split_sizes, dim=1)
|
||||
freqs_sin = self.freqs_sin.split(split_sizes, dim=1)
|
||||
|
||||
@@ -362,6 +362,11 @@ class WanRotaryPosEmbed(nn.Module):
|
||||
|
||||
h_dim = w_dim = 2 * (attention_head_dim // 6)
|
||||
t_dim = attention_head_dim - h_dim - w_dim
|
||||
|
||||
self.t_dim = t_dim
|
||||
self.h_dim = h_dim
|
||||
self.w_dim = w_dim
|
||||
|
||||
freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
|
||||
|
||||
freqs_cos = []
|
||||
@@ -387,11 +392,7 @@ class WanRotaryPosEmbed(nn.Module):
|
||||
p_t, p_h, p_w = self.patch_size
|
||||
ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
|
||||
|
||||
split_sizes = [
|
||||
self.attention_head_dim - 2 * (self.attention_head_dim // 3),
|
||||
self.attention_head_dim // 3,
|
||||
self.attention_head_dim // 3,
|
||||
]
|
||||
split_sizes = [self.t_dim, self.h_dim, self.w_dim]
|
||||
|
||||
freqs_cos = self.freqs_cos.split(split_sizes, dim=1)
|
||||
freqs_sin = self.freqs_sin.split(split_sizes, dim=1)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -861,6 +861,10 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
|
||||
else:
|
||||
sub_blocks[block_name] = block
|
||||
self.sub_blocks = sub_blocks
|
||||
if not len(self.block_names) == len(self.block_classes):
|
||||
raise ValueError(
|
||||
f"In {self.__class__.__name__}, the number of block_names and block_classes must be the same."
|
||||
)
|
||||
|
||||
def _get_inputs(self):
|
||||
inputs = []
|
||||
|
||||
@@ -132,6 +132,7 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
|
||||
@property
|
||||
def inputs(self) -> List[InputParam]:
|
||||
return [
|
||||
InputParam("latents"),
|
||||
InputParam(name="height"),
|
||||
InputParam(name="width"),
|
||||
InputParam(name="num_images_per_prompt", default=1),
|
||||
@@ -196,11 +197,11 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
|
||||
f"You have passed a list of generators of length {len(block_state.generator)}, but requested an effective batch"
|
||||
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
||||
)
|
||||
|
||||
block_state.latents = randn_tensor(
|
||||
shape, generator=block_state.generator, device=device, dtype=block_state.dtype
|
||||
)
|
||||
block_state.latents = components.pachifier.pack_latents(block_state.latents)
|
||||
if block_state.latents is None:
|
||||
block_state.latents = randn_tensor(
|
||||
shape, generator=block_state.generator, device=device, dtype=block_state.dtype
|
||||
)
|
||||
block_state.latents = components.pachifier.pack_latents(block_state.latents)
|
||||
|
||||
self.set_block_state(state, block_state)
|
||||
return components, state
|
||||
@@ -549,8 +550,7 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
|
||||
block_state.width // components.vae_scale_factor // 2,
|
||||
)
|
||||
]
|
||||
* block_state.batch_size
|
||||
]
|
||||
] * block_state.batch_size
|
||||
block_state.txt_seq_lens = (
|
||||
block_state.prompt_embeds_mask.sum(dim=1).tolist() if block_state.prompt_embeds_mask is not None else None
|
||||
)
|
||||
|
||||
@@ -74,8 +74,9 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
|
||||
block_state = self.get_block_state(state)
|
||||
|
||||
# YiYi Notes: remove support for output_type = "latents"; we can just skip the decode/encode step in modular
|
||||
vae_scale_factor = components.vae_scale_factor
|
||||
block_state.latents = components.pachifier.unpack_latents(
|
||||
block_state.latents, block_state.height, block_state.width
|
||||
block_state.latents, block_state.height, block_state.width, vae_scale_factor=vae_scale_factor
|
||||
)
|
||||
block_state.latents = block_state.latents.to(components.vae.dtype)
|
||||
|
||||
|
||||
@@ -503,6 +503,8 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks):
|
||||
block_state.prompt_embeds = block_state.prompt_embeds[:, : block_state.max_sequence_length]
|
||||
block_state.prompt_embeds_mask = block_state.prompt_embeds_mask[:, : block_state.max_sequence_length]
|
||||
|
||||
block_state.negative_prompt_embeds = None
|
||||
block_state.negative_prompt_embeds_mask = None
|
||||
if components.requires_unconditional_embeds:
|
||||
negative_prompt = block_state.negative_prompt or ""
|
||||
block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds(
|
||||
@@ -627,6 +629,8 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
|
||||
device=device,
|
||||
)
|
||||
|
||||
block_state.negative_prompt_embeds = None
|
||||
block_state.negative_prompt_embeds_mask = None
|
||||
if components.requires_unconditional_embeds:
|
||||
negative_prompt = block_state.negative_prompt or " "
|
||||
block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds_edit(
|
||||
@@ -679,6 +683,8 @@ class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
|
||||
device=device,
|
||||
)
|
||||
|
||||
block_state.negative_prompt_embeds = None
|
||||
block_state.negative_prompt_embeds_mask = None
|
||||
if components.requires_unconditional_embeds:
|
||||
negative_prompt = block_state.negative_prompt or " "
|
||||
block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = (
|
||||
|
||||
@@ -523,7 +523,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
|
||||
QwenImageOptionalControlNetBeforeDenoiseStep,
|
||||
QwenImageAutoDenoiseStep,
|
||||
]
|
||||
block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise", "decode"]
|
||||
block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise"]
|
||||
|
||||
@property
|
||||
def description(self):
|
||||
@@ -534,7 +534,6 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
|
||||
+ " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
|
||||
+ " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n"
|
||||
+ " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
|
||||
+ " - `QwenImageAutoDecodeStep` (decode) decodes the latents into images.\n\n"
|
||||
+ "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
|
||||
+ " - for image-to-image generation, you need to provide `image_latents`\n"
|
||||
+ " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
|
||||
|
||||
@@ -26,10 +26,7 @@ class QwenImagePachifier(ConfigMixin):
|
||||
config_name = "config.json"
|
||||
|
||||
@register_to_config
|
||||
def __init__(
|
||||
self,
|
||||
patch_size: int = 2,
|
||||
):
|
||||
def __init__(self, patch_size: int = 2):
|
||||
super().__init__()
|
||||
|
||||
def pack_latents(self, latents):
|
||||
|
||||
@@ -385,7 +385,13 @@ else:
|
||||
"WuerstchenDecoderPipeline",
|
||||
"WuerstchenPriorPipeline",
|
||||
]
|
||||
_import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline", "WanVideoToVideoPipeline", "WanVACEPipeline"]
|
||||
_import_structure["wan"] = [
|
||||
"WanPipeline",
|
||||
"WanImageToVideoPipeline",
|
||||
"WanVideoToVideoPipeline",
|
||||
"WanVACEPipeline",
|
||||
"WanAnimatePipeline",
|
||||
]
|
||||
_import_structure["kandinsky5"] = ["Kandinsky5T2VPipeline"]
|
||||
_import_structure["skyreels_v2"] = [
|
||||
"SkyReelsV2DiffusionForcingPipeline",
|
||||
@@ -803,7 +809,13 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
UniDiffuserTextDecoder,
|
||||
)
|
||||
from .visualcloze import VisualClozeGenerationPipeline, VisualClozePipeline
|
||||
from .wan import WanImageToVideoPipeline, WanPipeline, WanVACEPipeline, WanVideoToVideoPipeline
|
||||
from .wan import (
|
||||
WanAnimatePipeline,
|
||||
WanImageToVideoPipeline,
|
||||
WanPipeline,
|
||||
WanVACEPipeline,
|
||||
WanVideoToVideoPipeline,
|
||||
)
|
||||
from .wuerstchen import (
|
||||
WuerstchenCombinedPipeline,
|
||||
WuerstchenDecoderPipeline,
|
||||
|
||||
@@ -245,7 +245,7 @@ class BriaPipeline(DiffusionPipeline):
|
||||
return self._guidance_scale
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
@property
|
||||
def do_classifier_free_guidance(self):
|
||||
@@ -489,11 +489,11 @@ class BriaPipeline(DiffusionPipeline):
|
||||
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
|
||||
passed will be used. Must be in descending order.
|
||||
guidance_scale (`float`, *optional*, defaults to 5.0):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
||||
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
||||
|
||||
@@ -337,7 +337,7 @@ class BriaFiboPipeline(DiffusionPipeline):
|
||||
return self._guidance_scale
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
|
||||
@property
|
||||
@@ -498,11 +498,11 @@ class BriaFiboPipeline(DiffusionPipeline):
|
||||
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
|
||||
passed will be used. Must be in descending order.
|
||||
guidance_scale (`float`, *optional*, defaults to 5.0):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
||||
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
||||
|
||||
@@ -590,9 +590,10 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixi
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
||||
Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
|
||||
[Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
||||
Guidance rescale factor should fix overexposure when using zero terminal SNR.
|
||||
Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
|
||||
[Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
|
||||
using zero terminal SNR.
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of videos to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
@@ -777,7 +778,7 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixi
|
||||
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
if self.guidance_rescale > 0:
|
||||
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
|
||||
# Based on 3.4. in https://huggingface.co/papers/2305.08891
|
||||
noise_pred = rescale_noise_cfg(
|
||||
noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
|
||||
)
|
||||
|
||||
@@ -927,9 +927,10 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraL
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
||||
Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
|
||||
[Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
||||
Guidance rescale factor should fix overexposure when using zero terminal SNR.
|
||||
Flawed](https://huggingface.co/papers/2305.08891). `guidance_rescale` is defined as `φ` in equation 16. of
|
||||
[Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
|
||||
using zero terminal SNR.
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of videos to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
@@ -1194,7 +1195,7 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraL
|
||||
timestep, _ = timestep.chunk(2)
|
||||
|
||||
if self.guidance_rescale > 0:
|
||||
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
|
||||
# Based on 3.4. in https://huggingface.co/papers/2305.08891
|
||||
noise_pred = rescale_noise_cfg(
|
||||
noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
|
||||
)
|
||||
|
||||
@@ -654,9 +654,10 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
||||
Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
|
||||
[Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
||||
Guidance rescale factor should fix overexposure when using zero terminal SNR.
|
||||
Flawed](https://huggingface.co/papers/2305.08891). `guidance_rescale` is defined as `φ` in equation 16. of
|
||||
[Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
|
||||
using zero terminal SNR.
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of videos to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
@@ -851,7 +852,7 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
|
||||
timestep, _ = timestep.chunk(2)
|
||||
|
||||
if self.guidance_rescale > 0:
|
||||
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
|
||||
# Based on 3.4. in https://huggingface.co/papers/2305.08891
|
||||
noise_pred = rescale_noise_cfg(
|
||||
noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
|
||||
)
|
||||
|
||||
@@ -69,6 +69,39 @@ ASPECT_RATIO_512_BIN = {
|
||||
"2.0": [704, 352],
|
||||
}
|
||||
|
||||
# Aspect-ratio bins for the 1024 sample size. Each key is a ratio rendered as a string and
# each value is a pair of pixel dimensions whose first / second quotient rounds to that key
# (e.g. 704 / 1440 ≈ 0.49). Every dimension listed is a multiple of 32.
# NOTE(review): presumably the pairs are [height, width] — confirm against the binning helper.
ASPECT_RATIO_1024_BIN = {
    "0.49": [704, 1440],
    "0.52": [736, 1408],
    "0.53": [736, 1376],
    "0.57": [768, 1344],
    "0.59": [768, 1312],
    "0.62": [800, 1280],
    "0.67": [832, 1248],
    "0.68": [832, 1216],
    "0.78": [896, 1152],
    "0.83": [928, 1120],
    "0.94": [992, 1056],
    "1.0": [1024, 1024],
    "1.06": [1056, 992],
    "1.13": [1088, 960],
    "1.21": [1120, 928],
    "1.29": [1152, 896],
    "1.37": [1184, 864],
    "1.46": [1216, 832],
    "1.5": [1248, 832],
    "1.71": [1312, 768],
    "1.75": [1344, 768],
    "1.87": [1376, 736],
    "1.91": [1408, 736],
    "2.05": [1440, 704],
}
|
||||
|
||||
# Lookup table from a supported sample size (256, 512 or 1024) to the aspect-ratio bin table
# used for that size.
ASPECT_RATIO_BINS = {
    256: ASPECT_RATIO_256_BIN,
    512: ASPECT_RATIO_512_BIN,
    1024: ASPECT_RATIO_1024_BIN,
}
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@@ -536,11 +569,11 @@ class PRXPipeline(
|
||||
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
|
||||
passed will be used. Must be in descending order.
|
||||
guidance_scale (`float`, *optional*, defaults to 4.0):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
@@ -600,10 +633,12 @@ class PRXPipeline(
|
||||
"Resolution binning requires a VAE with image_processor, but VAE is not available. "
|
||||
"Set use_resolution_binning=False or provide a VAE."
|
||||
)
|
||||
if self.default_sample_size <= 256:
|
||||
aspect_ratio_bin = ASPECT_RATIO_256_BIN
|
||||
else:
|
||||
aspect_ratio_bin = ASPECT_RATIO_512_BIN
|
||||
if self.default_sample_size not in ASPECT_RATIO_BINS:
|
||||
raise ValueError(
|
||||
f"Resolution binning is only supported for default_sample_size in {list(ASPECT_RATIO_BINS.keys())}, "
|
||||
f"but got {self.default_sample_size}. Set use_resolution_binning=False to disable aspect ratio binning."
|
||||
)
|
||||
aspect_ratio_bin = ASPECT_RATIO_BINS[self.default_sample_size]
|
||||
|
||||
# Store original dimensions
|
||||
orig_height, orig_width = height, width
|
||||
|
||||
@@ -415,11 +415,11 @@ class SkyReelsV2Pipeline(DiffusionPipeline, SkyReelsV2LoraLoaderMixin):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, defaults to `6.0`):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of videos to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
|
||||
@@ -647,11 +647,11 @@ class SkyReelsV2DiffusionForcingPipeline(DiffusionPipeline, SkyReelsV2LoraLoader
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, defaults to `6.0`):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of videos to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
|
||||
@@ -698,11 +698,11 @@ class SkyReelsV2DiffusionForcingImageToVideoPipeline(DiffusionPipeline, SkyReels
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, defaults to `5.0`):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of videos to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
|
||||
@@ -524,11 +524,11 @@ class SkyReelsV2ImageToVideoPipeline(DiffusionPipeline, SkyReelsV2LoraLoaderMixi
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, defaults to `5.0`):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of videos to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
|
||||
@@ -23,6 +23,7 @@ except OptionalDependencyNotAvailable:
|
||||
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
|
||||
else:
|
||||
_import_structure["pipeline_wan"] = ["WanPipeline"]
|
||||
_import_structure["pipeline_wan_animate"] = ["WanAnimatePipeline"]
|
||||
_import_structure["pipeline_wan_i2v"] = ["WanImageToVideoPipeline"]
|
||||
_import_structure["pipeline_wan_vace"] = ["WanVACEPipeline"]
|
||||
_import_structure["pipeline_wan_video2video"] = ["WanVideoToVideoPipeline"]
|
||||
@@ -35,10 +36,10 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
from ...utils.dummy_torch_and_transformers_objects import *
|
||||
else:
|
||||
from .pipeline_wan import WanPipeline
|
||||
from .pipeline_wan_animate import WanAnimatePipeline
|
||||
from .pipeline_wan_i2v import WanImageToVideoPipeline
|
||||
from .pipeline_wan_vace import WanVACEPipeline
|
||||
from .pipeline_wan_video2video import WanVideoToVideoPipeline
|
||||
|
||||
else:
|
||||
import sys
|
||||
|
||||
|
||||
@@ -0,0 +1,185 @@
|
||||
# Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import PIL.Image
|
||||
import torch
|
||||
|
||||
from ...configuration_utils import register_to_config
|
||||
from ...image_processor import VaeImageProcessor
|
||||
from ...utils import PIL_INTERPOLATION
|
||||
|
||||
|
||||
class WanAnimateImageProcessor(VaeImageProcessor):
    r"""
    Image processor to preprocess the reference (character) image for the Wan Animate model.

    Args:
        do_resize (`bool`, *optional*, defaults to `True`):
            Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept
            `height` and `width` arguments from [`image_processor.VaeImageProcessor.preprocess`] method.
        vae_scale_factor (`int`, *optional*, defaults to `8`):
            VAE (spatial) scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of
            this factor.
        vae_latent_channels (`int`, *optional*, defaults to `16`):
            VAE latent channels.
        spatial_patch_size (`Tuple[int, int]`, *optional*, defaults to `(2, 2)`):
            The spatial patch size used by the diffusion transformer. For Wan models, this is typically (2, 2).
        resample (`str`, *optional*, defaults to `lanczos`):
            Resampling filter to use when resizing the image.
        reducing_gap (`int`, *optional*, defaults to `None`):
            Stored in the config; it is not read by any method defined on this class. NOTE(review): presumably
            consumed by the parent class when resizing — confirm.
        do_normalize (`bool`, *optional*, defaults to `True`):
            Whether to normalize the image to [-1,1].
        do_binarize (`bool`, *optional*, defaults to `False`):
            Whether to binarize the image to 0/1.
        do_convert_rgb (`bool`, *optional*, defaults to be `False`):
            Whether to convert the images to RGB format.
        do_convert_grayscale (`bool`, *optional*, defaults to be `False`):
            Whether to convert the images to grayscale format.
        fill_color (`str` or `float` or `Tuple[float, ...]`, *optional*, defaults to `0`):
            An optional fill color when `resize_mode` is set to `"fill"`. This will fill the empty space with that
            color instead of filling with data from the image. Any valid `color` argument to `PIL.Image.new` is valid;
            if `None`, will default to filling with data from `image`.
    """

    @register_to_config
    def __init__(
        self,
        do_resize: bool = True,
        vae_scale_factor: int = 8,
        vae_latent_channels: int = 16,
        spatial_patch_size: Tuple[int, int] = (2, 2),
        resample: str = "lanczos",
        reducing_gap: Optional[int] = None,
        do_normalize: bool = True,
        do_binarize: bool = False,
        do_convert_rgb: bool = False,
        do_convert_grayscale: bool = False,
        fill_color: Optional[Union[str, float, Tuple[float, ...]]] = 0,
    ):
        # NOTE(review): the parent initializer is invoked without forwarding any of the arguments above.
        # Confirm that it does not re-register its own defaults over the config values registered for
        # this class by `@register_to_config`.
        super().__init__()
        if do_convert_rgb and do_convert_grayscale:
            # The three literals are joined by implicit concatenation so the exception carries a
            # single message string (a stray comma previously split it into two arguments).
            raise ValueError(
                "`do_convert_rgb` and `do_convert_grayscale` can not both be set to `True`,"
                " if you intended to convert the image into RGB format, please set `do_convert_grayscale = False`."
                " if you intended to convert the image into grayscale format, please set `do_convert_rgb = False`"
            )

    def _resize_and_fill(
        self,
        image: PIL.Image.Image,
        width: int,
        height: int,
    ) -> PIL.Image.Image:
        r"""
        Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center
        the image within the dimensions. The remaining area is filled with `fill_color` from the config, or, when
        `fill_color` is `None`, with data taken from the borders of the resized image.

        Args:
            image (`PIL.Image.Image`):
                The image to resize and fill.
            width (`int`):
                The width to resize the image to.
            height (`int`):
                The height to resize the image to.

        Returns:
            `PIL.Image.Image`:
                The resized and filled image.
        """

        ratio = width / height
        src_ratio = image.width / image.height
        fill_with_image_data = self.config.fill_color is None
        # Any falsy configured color (including `None`) falls back to 0 for the canvas background.
        fill_color = self.config.fill_color or 0

        # Scale along the limiting dimension so the whole source fits inside (width, height).
        src_w = width if ratio < src_ratio else image.width * height // image.height
        src_h = height if ratio >= src_ratio else image.height * width // image.width

        resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION[self.config.resample])
        res = PIL.Image.new("RGB", (width, height), color=fill_color)
        # Paste the resized image centered on the canvas.
        res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))

        if fill_with_image_data:
            if ratio < src_ratio:
                # Source is relatively wider than the target: bands are left above and below.
                fill_height = height // 2 - src_h // 2
                if fill_height > 0:
                    res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0))
                    res.paste(
                        resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)),
                        box=(0, fill_height + src_h),
                    )
            elif ratio > src_ratio:
                # Source is relatively taller than the target: bands are left on both sides.
                fill_width = width // 2 - src_w // 2
                if fill_width > 0:
                    res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0))
                    res.paste(
                        resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)),
                        box=(fill_width + src_w, 0),
                    )

        return res

    def get_default_height_width(
        self,
        image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
        height: Optional[int] = None,
        width: Optional[int] = None,
    ) -> Tuple[int, int]:
        r"""
        Returns the height and width of the image, each rounded down to an integer multiple of
        `vae_scale_factor * spatial_patch_size` along the corresponding axis.

        Args:
            image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
                The image input, which can be a PIL image, NumPy array, or PyTorch tensor. If it is a NumPy array, it
                should have shape `[batch, height, width]` or `[batch, height, width, channels]`. If it is a PyTorch
                tensor, it should have shape `[batch, channels, height, width]`.
            height (`Optional[int]`, *optional*, defaults to `None`):
                The height of the preprocessed image. If `None`, the height of the `image` input will be used.
            width (`Optional[int]`, *optional*, defaults to `None`):
                The width of the preprocessed image. If `None`, the width of the `image` input will be used.

        Returns:
            `Tuple[int, int]`:
                A tuple containing the height and width. The height is floored to a multiple of
                `vae_scale_factor * spatial_patch_size[0]` and the width to a multiple of
                `vae_scale_factor * spatial_patch_size[1]`.
        """

        if height is None:
            if isinstance(image, PIL.Image.Image):
                height = image.height
            elif isinstance(image, torch.Tensor):
                height = image.shape[2]
            else:
                height = image.shape[1]

        if width is None:
            if isinstance(image, PIL.Image.Image):
                width = image.width
            elif isinstance(image, torch.Tensor):
                width = image.shape[3]
            else:
                width = image.shape[2]

        max_area = width * height
        aspect_ratio = height / width
        mod_value_h = self.config.vae_scale_factor * self.config.spatial_patch_size[0]
        mod_value_w = self.config.vae_scale_factor * self.config.spatial_patch_size[1]

        # Try to preserve the aspect ratio
        # (`// mod * mod` floors each side to the required multiple).
        height = round(np.sqrt(max_area * aspect_ratio)) // mod_value_h * mod_value_h
        width = round(np.sqrt(max_area / aspect_ratio)) // mod_value_w * mod_value_w

        return height, width
|
||||
File diff suppressed because it is too large
Load Diff
@@ -758,11 +758,11 @@ class WanVACEPipeline(DiffusionPipeline, WanLoraLoaderMixin):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, defaults to `5.0`):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
guidance_scale_2 (`float`, *optional*, defaults to `None`):
|
||||
Guidance scale for the low-noise stage transformer (`transformer_2`). If `None` and the pipeline's
|
||||
`boundary_ratio` is not None, uses the same value as `guidance_scale`. Only used when `transformer_2`
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
|
||||
@@ -9,13 +9,48 @@ from ..utils import BaseOutput
|
||||
from .scheduling_utils import SchedulerMixin
|
||||
|
||||
|
||||
def gumbel_noise(t, generator=None):
|
||||
def gumbel_noise(t: torch.Tensor, generator: Optional[torch.Generator] = None) -> torch.Tensor:
    """
    Draw Gumbel noise shaped like a reference tensor.

    Args:
        t (`torch.Tensor`):
            Reference tensor; the returned noise matches its shape and dtype and ends up on its device.
        generator (`torch.Generator`, *optional*):
            Random number generator used for the uniform draw, for reproducible sampling.

    Returns:
        `torch.Tensor`:
            Noise computed as `-log(-log(u))` from uniform samples `u` in `[0, 1)`, with both logarithm inputs
            clamped to at least `1e-20`.
    """
    # The uniform draw happens on the generator's device when a generator is supplied,
    # and the samples are then moved to the device of `t`.
    sampling_device = t.device if generator is None else generator.device
    uniform = torch.zeros_like(t, device=sampling_device)
    uniform.uniform_(0, 1, generator=generator)
    uniform = uniform.to(t.device)

    # Clamp before each logarithm so that log(0) is never evaluated.
    neg_log_uniform = -torch.log(uniform.clamp(1e-20))
    return -torch.log(neg_log_uniform.clamp(1e-20))
|
||||
|
||||
|
||||
def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None):
|
||||
def mask_by_random_topk(
|
||||
mask_len: torch.Tensor,
|
||||
probs: torch.Tensor,
|
||||
temperature: float = 1.0,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Mask tokens by selecting the top-k lowest confidence scores with temperature-based randomness.
|
||||
|
||||
Args:
|
||||
mask_len (`torch.Tensor`):
|
||||
Number of tokens to mask per sample in the batch.
|
||||
probs (`torch.Tensor`):
|
||||
Probability scores for each token.
|
||||
temperature (`float`, *optional*, defaults to 1.0):
|
||||
Temperature parameter for controlling randomness in the masking process.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator for reproducible sampling.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
Boolean mask indicating which tokens should be masked.
|
||||
"""
|
||||
confidence = torch.log(probs.clamp(1e-20)) + temperature * gumbel_noise(probs, generator=generator)
|
||||
sorted_confidence = torch.sort(confidence, dim=-1).values
|
||||
cut_off = torch.gather(sorted_confidence, 1, mask_len.long())
|
||||
@@ -29,28 +64,46 @@ class AmusedSchedulerOutput(BaseOutput):
|
||||
Output class for the scheduler's `step` function output.
|
||||
|
||||
Args:
|
||||
prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
|
||||
Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
|
||||
denoising loop.
|
||||
pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
|
||||
The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
|
||||
`pred_original_sample` can be used to preview progress or for guidance.
|
||||
prev_sample (`torch.LongTensor` of shape `(batch_size, height, width)` or `(batch_size, sequence_length)`):
|
||||
Computed sample `(x_{t-1})` of previous timestep with token IDs. `prev_sample` should be used as next model
|
||||
input in the denoising loop.
|
||||
pred_original_sample (`torch.LongTensor` of shape `(batch_size, height, width)` or `(batch_size, sequence_length)`, *optional*):
|
||||
The predicted fully denoised sample `(x_{0})` with token IDs based on the model output from the current
|
||||
timestep. `pred_original_sample` can be used to preview progress or for guidance.
|
||||
"""
|
||||
|
||||
prev_sample: torch.Tensor
|
||||
pred_original_sample: torch.Tensor = None
|
||||
pred_original_sample: Optional[torch.Tensor] = None
|
||||
|
||||
|
||||
class AmusedScheduler(SchedulerMixin, ConfigMixin):
|
||||
"""
|
||||
A scheduler for masked token generation as used in [`AmusedPipeline`].
|
||||
|
||||
This scheduler iteratively unmasks tokens based on their confidence scores, following either a cosine or linear
|
||||
schedule. Unlike traditional diffusion schedulers that work with continuous pixel values, this scheduler operates
|
||||
on discrete token IDs, making it suitable for autoregressive and non-autoregressive masked token generation models.
|
||||
|
||||
This scheduler inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the
|
||||
generic methods the library implements for all schedulers such as loading and saving.
|
||||
|
||||
Args:
|
||||
mask_token_id (`int`):
|
||||
The token ID used to represent masked tokens in the sequence.
|
||||
masking_schedule (`Literal["cosine", "linear"]`, *optional*, defaults to `"cosine"`):
|
||||
The schedule type for determining the mask ratio at each timestep. Can be either `"cosine"` or `"linear"`.
|
||||
"""
|
||||
|
||||
order = 1
|
||||
|
||||
temperatures: torch.Tensor
|
||||
temperatures: Optional[torch.Tensor]
|
||||
timesteps: Optional[torch.Tensor]
|
||||
|
||||
@register_to_config
|
||||
def __init__(
|
||||
self,
|
||||
mask_token_id: int,
|
||||
masking_schedule: str = "cosine",
|
||||
masking_schedule: Literal["cosine", "linear"] = "cosine",
|
||||
):
|
||||
self.temperatures = None
|
||||
self.timesteps = None
|
||||
@@ -58,9 +111,23 @@ class AmusedScheduler(SchedulerMixin, ConfigMixin):
|
||||
def set_timesteps(
|
||||
self,
|
||||
num_inference_steps: int,
|
||||
temperature: Union[int, Tuple[int, int], List[int]] = (2, 0),
|
||||
device: Union[str, torch.device] = None,
|
||||
):
|
||||
temperature: Union[float, Tuple[float, float], List[float]] = (2, 0),
|
||||
device: Optional[Union[str, torch.device]] = None,
|
||||
) -> None:
|
||||
"""
|
||||
Set the discrete timesteps used for the diffusion chain (to be run before inference).
|
||||
|
||||
Args:
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model.
|
||||
temperature (`Union[float, Tuple[float, float], List[float]]`, *optional*, defaults to `(2, 0)`):
|
||||
Temperature parameter(s) for controlling the randomness of sampling. If a tuple or list is provided,
|
||||
temperatures will be linearly interpolated between the first and second values across all timesteps. If
|
||||
a single value is provided, temperatures will be linearly interpolated from that value to 0.01.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps and temperatures should be moved to. If `None`, the timesteps are not
|
||||
moved.
|
||||
"""
|
||||
self.timesteps = torch.arange(num_inference_steps, device=device).flip(0)
|
||||
|
||||
if isinstance(temperature, (tuple, list)):
|
||||
@@ -71,12 +138,38 @@ class AmusedScheduler(SchedulerMixin, ConfigMixin):
|
||||
def step(
|
||||
self,
|
||||
model_output: torch.Tensor,
|
||||
timestep: torch.long,
|
||||
timestep: int,
|
||||
sample: torch.LongTensor,
|
||||
starting_mask_ratio: int = 1,
|
||||
starting_mask_ratio: float = 1.0,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[AmusedSchedulerOutput, Tuple]:
|
||||
) -> Union[AmusedSchedulerOutput, Tuple[torch.Tensor, torch.Tensor]]:
|
||||
"""
|
||||
Predict the sample at the previous timestep by masking tokens based on confidence scores.
|
||||
|
||||
Args:
|
||||
model_output (`torch.Tensor`):
|
||||
The direct output from the learned diffusion model. Typically of shape `(batch_size, num_tokens,
|
||||
codebook_size)` or `(batch_size, codebook_size, height, width)` for 2D inputs.
|
||||
timestep (`int`):
|
||||
The current discrete timestep in the diffusion chain.
|
||||
sample (`torch.LongTensor`):
|
||||
A current instance of a sample created by the diffusion process. Contains token IDs, with masked
|
||||
positions indicated by `mask_token_id`.
|
||||
starting_mask_ratio (`float`, *optional*, defaults to 1.0):
|
||||
A multiplier applied to the mask ratio schedule. Values less than 1.0 will result in fewer tokens being
|
||||
masked at each step.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator for reproducible sampling.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether to return an [`~schedulers.scheduling_amused.AmusedSchedulerOutput`] or a plain tuple.
|
||||
|
||||
Returns:
|
||||
[`~schedulers.scheduling_amused.AmusedSchedulerOutput`] or `tuple`:
|
||||
If `return_dict` is `True`, [`~schedulers.scheduling_amused.AmusedSchedulerOutput`] is returned,
|
||||
otherwise a tuple is returned where the first element is the sample tensor (`prev_sample`) and the
|
||||
second element is the predicted original sample tensor (`pred_original_sample`).
|
||||
"""
|
||||
two_dim_input = sample.ndim == 3 and model_output.ndim == 4
|
||||
|
||||
if two_dim_input:
|
||||
@@ -137,7 +230,27 @@ class AmusedScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
return AmusedSchedulerOutput(prev_sample, pred_original_sample)
|
||||
|
||||
def add_noise(self, sample, timesteps, generator=None):
|
||||
def add_noise(
|
||||
self,
|
||||
sample: torch.LongTensor,
|
||||
timesteps: int,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
) -> torch.LongTensor:
|
||||
"""
|
||||
Add noise to a sample by randomly masking tokens according to the masking schedule.
|
||||
|
||||
Args:
|
||||
sample (`torch.LongTensor`):
|
||||
The input sample containing token IDs to be partially masked.
|
||||
timesteps (`int`):
|
||||
The timestep that determines how much masking to apply. Higher timesteps result in more masking.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator for reproducible masking.
|
||||
|
||||
Returns:
|
||||
`torch.LongTensor`:
|
||||
The sample with some tokens replaced by `mask_token_id` according to the masking schedule.
|
||||
"""
|
||||
step_idx = (self.timesteps == timesteps).nonzero()
|
||||
ratio = (step_idx + 1) / len(self.timesteps)
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple, Union
|
||||
from typing import Literal, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
|
||||
@@ -12,10 +12,10 @@ from .scheduling_utils import SchedulerMixin
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -23,16 +23,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
|
||||
@@ -121,7 +121,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -287,7 +287,23 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
|
||||
return c_skip, c_out
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -302,7 +318,14 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep):
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -410,6 +433,21 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -137,7 +137,7 @@ class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -266,6 +266,19 @@ class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -537,6 +550,21 @@ class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -49,10 +49,10 @@ class DDIMSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -92,17 +93,17 @@ def betas_for_alpha_bar(
|
||||
return torch.tensor(betas, dtype=torch.float32)
|
||||
|
||||
|
||||
def rescale_zero_terminal_snr(betas):
|
||||
def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR. Based on https://huggingface.co/papers/2305.08891 (Algorithm 1).
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
the betas that the scheduler is being initialized with.
|
||||
The betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -143,9 +144,9 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
The starting `beta` value of inference.
|
||||
beta_end (`float`, defaults to 0.02):
|
||||
The final `beta` value.
|
||||
beta_schedule (`str`, defaults to `"linear"`):
|
||||
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
|
||||
`linear`, `scaled_linear`, or `squaredcos_cap_v2`.
|
||||
beta_schedule (`Literal["linear", "scaled_linear", "squaredcos_cap_v2"]`, defaults to `"linear"`):
|
||||
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Must be one
|
||||
of `"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`.
|
||||
trained_betas (`np.ndarray`, *optional*):
|
||||
Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
|
||||
clip_sample (`bool`, defaults to `True`):
|
||||
@@ -158,10 +159,10 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
otherwise it uses the alpha value at step 0.
|
||||
steps_offset (`int`, defaults to 0):
|
||||
An offset added to the inference steps, as required by some model families.
|
||||
prediction_type (`str`, defaults to `epsilon`, *optional*):
|
||||
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
|
||||
`sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
|
||||
Video](https://imagen.research.google/video/paper.pdf) paper).
|
||||
prediction_type (`Literal["epsilon", "sample", "v_prediction"]`, defaults to `"epsilon"`):
|
||||
Prediction type of the scheduler function. Must be one of `"epsilon"` (predicts the noise of the diffusion
|
||||
process), `"sample"` (directly predicts the noisy sample), or `"v_prediction"` (see section 2.4 of [Imagen
|
||||
Video](https://huggingface.co/papers/2210.02303) paper).
|
||||
thresholding (`bool`, defaults to `False`):
|
||||
Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
|
||||
as Stable Diffusion.
|
||||
@@ -169,9 +170,10 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
|
||||
sample_max_value (`float`, defaults to 1.0):
|
||||
The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
|
||||
timestep_spacing (`str`, defaults to `"leading"`):
|
||||
The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
|
||||
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
|
||||
timestep_spacing (`Literal["leading", "trailing", "linspace"]`, defaults to `"leading"`):
|
||||
The way the timesteps should be scaled. Must be one of `"leading"`, `"trailing"`, or `"linspace"`. Refer to
|
||||
Table 2 of the [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://huggingface.co/papers/2305.08891) for more information.
|
||||
rescale_betas_zero_snr (`bool`, defaults to `False`):
|
||||
Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
|
||||
dark samples instead of limiting it to samples with medium brightness. Loosely related to
|
||||
@@ -187,17 +189,17 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
num_train_timesteps: int = 1000,
|
||||
beta_start: float = 0.0001,
|
||||
beta_end: float = 0.02,
|
||||
beta_schedule: str = "linear",
|
||||
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
|
||||
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
|
||||
clip_sample: bool = True,
|
||||
set_alpha_to_one: bool = True,
|
||||
steps_offset: int = 0,
|
||||
prediction_type: str = "epsilon",
|
||||
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
|
||||
thresholding: bool = False,
|
||||
dynamic_thresholding_ratio: float = 0.995,
|
||||
clip_sample_range: float = 1.0,
|
||||
sample_max_value: float = 1.0,
|
||||
timestep_spacing: str = "leading",
|
||||
timestep_spacing: Literal["leading", "trailing", "linspace"] = "leading",
|
||||
rescale_betas_zero_snr: bool = False,
|
||||
):
|
||||
if trained_betas is not None:
|
||||
@@ -250,7 +252,25 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
"""
|
||||
return sample
|
||||
|
||||
def _get_variance(self, timestep, prev_timestep):
|
||||
def _get_variance(self, timestep: int, prev_timestep: int) -> torch.Tensor:
|
||||
"""
|
||||
Computes the variance of the noise added at a given diffusion step.
|
||||
|
||||
For a given `timestep` and its previous step, this method calculates the variance as defined in DDIM/DDPM
|
||||
literature:
|
||||
var_t = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
|
||||
where alpha_prod is the cumulative product of the alphas and beta_prod = 1 - alpha_prod.
|
||||
|
||||
Args:
|
||||
timestep (`int`):
|
||||
The current timestep in the diffusion process.
|
||||
prev_timestep (`int`):
|
||||
The previous timestep in the diffusion process. If negative, uses `final_alpha_cumprod`.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The variance for the current timestep.
|
||||
"""
|
||||
alpha_prod_t = self.alphas_cumprod[timestep]
|
||||
alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
|
||||
beta_prod_t = 1 - alpha_prod_t
|
||||
@@ -263,6 +283,8 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -270,6 +292,14 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -294,13 +324,18 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
return sample
|
||||
|
||||
def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
|
||||
def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None) -> None:
|
||||
"""
|
||||
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
|
||||
|
||||
Args:
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model.
|
||||
device (`Union[str, torch.device]`, *optional*):
|
||||
The device to use for the timesteps.
|
||||
|
||||
Raises:
|
||||
ValueError: If `num_inference_steps` is larger than `self.config.num_train_timesteps`.
|
||||
"""
|
||||
|
||||
if num_inference_steps > self.config.num_train_timesteps:
|
||||
@@ -346,7 +381,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
sample: torch.Tensor,
|
||||
eta: float = 0.0,
|
||||
use_clipped_model_output: bool = False,
|
||||
generator=None,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
variance_noise: Optional[torch.Tensor] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[DDIMSchedulerOutput, Tuple]:
|
||||
@@ -357,20 +392,21 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
Args:
|
||||
model_output (`torch.Tensor`):
|
||||
The direct output from learned diffusion model.
|
||||
timestep (`float`):
|
||||
timestep (`int`):
|
||||
The current discrete timestep in the diffusion chain.
|
||||
sample (`torch.Tensor`):
|
||||
A current instance of a sample created by the diffusion process.
|
||||
eta (`float`):
|
||||
The weight of noise for added noise in diffusion step.
|
||||
use_clipped_model_output (`bool`, defaults to `False`):
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
The weight of noise for added noise in diffusion step. A value of 0 corresponds to DDIM (deterministic)
|
||||
and 1 corresponds to DDPM (fully stochastic).
|
||||
use_clipped_model_output (`bool`, *optional*, defaults to `False`):
|
||||
If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
|
||||
because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
|
||||
clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
|
||||
`use_clipped_model_output` has no effect.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator.
|
||||
variance_noise (`torch.Tensor`):
|
||||
A random number generator for reproducible sampling.
|
||||
variance_noise (`torch.Tensor`, *optional*):
|
||||
Alternative to generating noise with `generator` by directly providing the noise for the variance
|
||||
itself. Useful for methods such as [`CycleDiffusion`].
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
@@ -477,6 +513,22 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -499,6 +551,21 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity from the sample and noise as `sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample`.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
@@ -517,5 +584,5 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
|
||||
return velocity
|
||||
|
||||
def __len__(self):
|
||||
def __len__(self) -> int:
|
||||
return self.config.num_train_timesteps
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -49,10 +49,10 @@ class DDIMSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -408,6 +409,22 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -430,6 +447,21 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# and https://github.com/hojonathanho/diffusion
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -47,10 +47,10 @@ class DDIMSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -58,16 +58,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -95,13 +96,13 @@ def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR. Based on https://huggingface.co/papers/2305.08891 (Algorithm 1).
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
the betas that the scheduler is being initialized with.
|
||||
The betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -49,10 +49,10 @@ class DDIMParallelSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -97,13 +98,13 @@ def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescale betas to have zero terminal SNR, based on https://huggingface.co/papers/2305.08891 (Algorithm 1).
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
the betas that the scheduler is being initialized with.
|
||||
The betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -194,17 +195,17 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
num_train_timesteps: int = 1000,
|
||||
beta_start: float = 0.0001,
|
||||
beta_end: float = 0.02,
|
||||
beta_schedule: str = "linear",
|
||||
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
|
||||
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
|
||||
clip_sample: bool = True,
|
||||
set_alpha_to_one: bool = True,
|
||||
steps_offset: int = 0,
|
||||
prediction_type: str = "epsilon",
|
||||
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
|
||||
thresholding: bool = False,
|
||||
dynamic_thresholding_ratio: float = 0.995,
|
||||
clip_sample_range: float = 1.0,
|
||||
sample_max_value: float = 1.0,
|
||||
timestep_spacing: str = "leading",
|
||||
timestep_spacing: Literal["leading", "trailing", "linspace"] = "leading",
|
||||
rescale_betas_zero_snr: bool = False,
|
||||
):
|
||||
if trained_betas is not None:
|
||||
@@ -285,6 +286,8 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -292,6 +295,14 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -324,6 +335,11 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
Args:
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model.
|
||||
device (`Union[str, torch.device]`, *optional*):
|
||||
The device to use for the timesteps.
|
||||
|
||||
Raises:
|
||||
ValueError: If `num_inference_steps` is larger than `self.config.num_train_timesteps`.
|
||||
"""
|
||||
|
||||
if num_inference_steps > self.config.num_train_timesteps:
|
||||
@@ -602,6 +618,22 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -624,6 +656,21 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -46,10 +46,10 @@ class DDPMSchedulerOutput(BaseOutput):
|
||||
|
||||
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -57,16 +57,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -90,17 +91,17 @@ def betas_for_alpha_bar(
|
||||
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
|
||||
def rescale_zero_terminal_snr(betas):
|
||||
def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Rescale betas to have zero terminal SNR, based on https://huggingface.co/papers/2305.08891 (Algorithm 1).
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
the betas that the scheduler is being initialized with.
|
||||
The betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -134,39 +135,37 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
methods the library implements for all schedulers such as loading and saving.
|
||||
|
||||
Args:
|
||||
num_train_timesteps (`int`, defaults to 1000):
|
||||
num_train_timesteps (`int`, defaults to `1000`):
|
||||
The number of diffusion steps to train the model.
|
||||
beta_start (`float`, defaults to 0.0001):
|
||||
beta_start (`float`, defaults to `0.0001`):
|
||||
The starting `beta` value of inference.
|
||||
beta_end (`float`, defaults to 0.02):
|
||||
beta_end (`float`, defaults to `0.02`):
|
||||
The final `beta` value.
|
||||
beta_schedule (`str`, defaults to `"linear"`):
|
||||
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
|
||||
`linear`, `scaled_linear`, `squaredcos_cap_v2`, or `sigmoid`.
|
||||
beta_schedule (`"linear"`, `"scaled_linear"`, `"squaredcos_cap_v2"`, or `"sigmoid"`, defaults to `"linear"`):
|
||||
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model.
|
||||
trained_betas (`np.ndarray`, *optional*):
|
||||
An array of betas to pass directly to the constructor without using `beta_start` and `beta_end`.
|
||||
variance_type (`str`, defaults to `"fixed_small"`):
|
||||
Clip the variance when adding noise to the denoised sample. Choose from `fixed_small`, `fixed_small_log`,
|
||||
`fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
|
||||
variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, defaults to `"fixed_small"`):
|
||||
Clip the variance when adding noise to the denoised sample.
|
||||
clip_sample (`bool`, defaults to `True`):
|
||||
Clip the predicted sample for numerical stability.
|
||||
clip_sample_range (`float`, defaults to 1.0):
|
||||
clip_sample_range (`float`, defaults to `1.0`):
|
||||
The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
|
||||
prediction_type (`str`, defaults to `epsilon`, *optional*):
|
||||
prediction_type (`"epsilon"`, `"sample"`, or `"v_prediction"`, defaults to `"epsilon"`):
|
||||
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
|
||||
`sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
|
||||
Video](https://imagen.research.google/video/paper.pdf) paper).
|
||||
thresholding (`bool`, defaults to `False`):
|
||||
Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
|
||||
as Stable Diffusion.
|
||||
dynamic_thresholding_ratio (`float`, defaults to 0.995):
|
||||
dynamic_thresholding_ratio (`float`, defaults to `0.995`):
|
||||
The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
|
||||
sample_max_value (`float`, defaults to 1.0):
|
||||
sample_max_value (`float`, defaults to `1.0`):
|
||||
The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
|
||||
timestep_spacing (`str`, defaults to `"leading"`):
|
||||
timestep_spacing (`"linspace"`, `"leading"`, or `"trailing"`, defaults to `"leading"`):
|
||||
The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
|
||||
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
|
||||
steps_offset (`int`, defaults to 0):
|
||||
steps_offset (`int`, defaults to `0`):
|
||||
An offset added to the inference steps, as required by some model families.
|
||||
rescale_betas_zero_snr (`bool`, defaults to `False`):
|
||||
Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
|
||||
@@ -183,16 +182,18 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
num_train_timesteps: int = 1000,
|
||||
beta_start: float = 0.0001,
|
||||
beta_end: float = 0.02,
|
||||
beta_schedule: str = "linear",
|
||||
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2", "sigmoid"] = "linear",
|
||||
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
|
||||
variance_type: str = "fixed_small",
|
||||
variance_type: Literal[
|
||||
"fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"
|
||||
] = "fixed_small",
|
||||
clip_sample: bool = True,
|
||||
prediction_type: str = "epsilon",
|
||||
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
|
||||
thresholding: bool = False,
|
||||
dynamic_thresholding_ratio: float = 0.995,
|
||||
clip_sample_range: float = 1.0,
|
||||
sample_max_value: float = 1.0,
|
||||
timestep_spacing: str = "leading",
|
||||
timestep_spacing: Literal["linspace", "leading", "trailing"] = "leading",
|
||||
steps_offset: int = 0,
|
||||
rescale_betas_zero_snr: bool = False,
|
||||
):
|
||||
@@ -322,7 +323,31 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
self.timesteps = torch.from_numpy(timesteps).to(device)
|
||||
|
||||
def _get_variance(self, t, predicted_variance=None, variance_type=None):
|
||||
def _get_variance(
|
||||
self,
|
||||
t: int,
|
||||
predicted_variance: Optional[torch.Tensor] = None,
|
||||
variance_type: Optional[
|
||||
Literal["fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"]
|
||||
] = None,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Compute the variance for a given timestep according to the specified variance type.
|
||||
|
||||
Args:
|
||||
t (`int`):
|
||||
The current timestep.
|
||||
predicted_variance (`torch.Tensor`, *optional*):
|
||||
The predicted variance from the model. Used only when `variance_type` is `"learned"` or
|
||||
`"learned_range"`.
|
||||
variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, *optional*):
|
||||
The type of variance to compute. If `None`, uses the variance type specified in the scheduler
|
||||
configuration.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed variance.
|
||||
"""
|
||||
prev_t = self.previous_timestep(t)
|
||||
|
||||
alpha_prod_t = self.alphas_cumprod[t]
|
||||
@@ -364,6 +389,8 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -371,6 +398,14 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -400,7 +435,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
model_output: torch.Tensor,
|
||||
timestep: int,
|
||||
sample: torch.Tensor,
|
||||
generator=None,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[DDPMSchedulerOutput, Tuple]:
|
||||
"""
|
||||
@@ -410,20 +445,19 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
Args:
|
||||
model_output (`torch.Tensor`):
|
||||
The direct output from learned diffusion model.
|
||||
timestep (`float`):
|
||||
timestep (`int`):
|
||||
The current discrete timestep in the diffusion chain.
|
||||
sample (`torch.Tensor`):
|
||||
A current instance of a sample created by the diffusion process.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
return_dict (`bool`, defaults to `True`):
|
||||
Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`.
|
||||
|
||||
Returns:
|
||||
[`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`:
|
||||
If return_dict is `True`, [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] is returned, otherwise a
|
||||
tuple is returned where the first element is the sample tensor.
|
||||
|
||||
"""
|
||||
t = timestep
|
||||
|
||||
@@ -504,6 +538,22 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -525,6 +575,21 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
return noisy_samples
|
||||
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
@@ -543,10 +608,21 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
|
||||
return velocity
|
||||
|
||||
def __len__(self):
|
||||
def __len__(self) -> int:
|
||||
return self.config.num_train_timesteps
|
||||
|
||||
def previous_timestep(self, timestep):
|
||||
def previous_timestep(self, timestep: int) -> int:
|
||||
"""
|
||||
Compute the previous timestep in the diffusion chain.
|
||||
|
||||
Args:
|
||||
timestep (`int`):
|
||||
The current timestep.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The previous timestep.
|
||||
"""
|
||||
if self.custom_timesteps or self.num_inference_steps:
|
||||
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
|
||||
if index == self.timesteps.shape[0] - 1:
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -48,10 +48,10 @@ class DDPMParallelSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -59,16 +59,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -96,13 +97,13 @@ def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescale betas to have zero terminal SNR, based on https://huggingface.co/papers/2305.08891 (Algorithm 1).
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
the betas that the scheduler is being initialized with.
|
||||
The betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -191,16 +192,18 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
num_train_timesteps: int = 1000,
|
||||
beta_start: float = 0.0001,
|
||||
beta_end: float = 0.02,
|
||||
beta_schedule: str = "linear",
|
||||
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2", "sigmoid"] = "linear",
|
||||
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
|
||||
variance_type: str = "fixed_small",
|
||||
variance_type: Literal[
|
||||
"fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"
|
||||
] = "fixed_small",
|
||||
clip_sample: bool = True,
|
||||
prediction_type: str = "epsilon",
|
||||
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
|
||||
thresholding: bool = False,
|
||||
dynamic_thresholding_ratio: float = 0.995,
|
||||
clip_sample_range: float = 1.0,
|
||||
sample_max_value: float = 1.0,
|
||||
timestep_spacing: str = "leading",
|
||||
timestep_spacing: Literal["linspace", "leading", "trailing"] = "leading",
|
||||
steps_offset: int = 0,
|
||||
rescale_betas_zero_snr: bool = False,
|
||||
):
|
||||
@@ -333,7 +336,31 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
self.timesteps = torch.from_numpy(timesteps).to(device)
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance
|
||||
def _get_variance(self, t, predicted_variance=None, variance_type=None):
|
||||
def _get_variance(
|
||||
self,
|
||||
t: int,
|
||||
predicted_variance: Optional[torch.Tensor] = None,
|
||||
variance_type: Optional[
|
||||
Literal["fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"]
|
||||
] = None,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Compute the variance for a given timestep according to the specified variance type.
|
||||
|
||||
Args:
|
||||
t (`int`):
|
||||
The current timestep.
|
||||
predicted_variance (`torch.Tensor`, *optional*):
|
||||
The predicted variance from the model. Used only when `variance_type` is `"learned"` or
|
||||
`"learned_range"`.
|
||||
variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, *optional*):
|
||||
The type of variance to compute. If `None`, uses the variance type specified in the scheduler
|
||||
configuration.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed variance.
|
||||
"""
|
||||
prev_t = self.previous_timestep(t)
|
||||
|
||||
alpha_prod_t = self.alphas_cumprod[t]
|
||||
@@ -376,6 +403,8 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -383,6 +412,14 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -593,6 +630,22 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -615,6 +668,21 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
@@ -638,6 +706,17 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
|
||||
def previous_timestep(self, timestep):
|
||||
"""
|
||||
Compute the previous timestep in the diffusion chain.
|
||||
|
||||
Args:
|
||||
timestep (`int`):
|
||||
The current timestep.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The previous timestep.
|
||||
"""
|
||||
if self.custom_timesteps or self.num_inference_steps:
|
||||
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
|
||||
if index == self.timesteps.shape[0] - 1:
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
|
||||
|
||||
import math
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -32,10 +32,10 @@ if is_scipy_available():
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -43,16 +43,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -229,7 +230,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -320,6 +321,8 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -327,6 +330,14 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -353,6 +364,19 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -388,7 +412,20 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -414,7 +451,19 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""Constructs an exponential noise schedule."""
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -438,7 +487,24 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -50,10 +50,10 @@ class DDIMSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -61,16 +61,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -445,6 +446,22 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -467,6 +484,21 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
|
||||
|
||||
import math
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -32,10 +32,10 @@ if is_scipy_available():
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -43,16 +43,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -80,13 +81,13 @@ def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR. Based on https://huggingface.co/papers/2305.08891 (Algorithm 1).
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
the betas that the scheduler is being initialized with.
|
||||
The betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -323,7 +324,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -460,6 +461,8 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -467,6 +470,14 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -493,6 +504,19 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -527,7 +551,20 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -566,7 +603,19 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""Constructs an exponential noise schedule."""
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -590,7 +639,24 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
|
||||
|
||||
import math
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -32,10 +32,10 @@ if is_scipy_available():
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -43,16 +43,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -332,6 +333,8 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -339,6 +342,14 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -365,6 +376,19 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -400,7 +424,20 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -426,7 +463,19 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""Constructs an exponential noise schedule."""
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -450,7 +499,24 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -115,10 +115,10 @@ class BrownianTreeNoiseSampler:
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -126,16 +126,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -250,7 +251,23 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
|
||||
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -265,7 +282,14 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep):
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -301,7 +325,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -429,6 +453,19 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -467,7 +504,19 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""Constructs an exponential noise schedule."""
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -491,7 +540,24 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -645,6 +711,21 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
|
||||
|
||||
import math
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -34,10 +34,10 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -45,16 +45,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -294,7 +295,7 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -410,6 +411,8 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -417,6 +420,14 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -443,6 +454,19 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -478,7 +502,20 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -504,7 +541,19 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""Constructs an exponential noise schedule."""
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -528,7 +577,24 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
|
||||
@@ -169,7 +169,7 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -299,6 +299,8 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -306,6 +308,14 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -332,6 +342,19 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -672,6 +695,21 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -155,7 +155,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -284,7 +284,23 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
|
||||
return sigmas
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -299,7 +315,14 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep):
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -413,6 +436,21 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -49,10 +49,10 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -97,13 +98,13 @@ def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
the betas that the scheduler is being initialized with.
|
||||
The betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -245,7 +246,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -319,7 +320,23 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -334,7 +351,14 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep):
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -451,6 +475,21 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -52,10 +52,10 @@ class EulerDiscreteSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -63,16 +63,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -96,17 +97,17 @@ def betas_for_alpha_bar(
|
||||
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
|
||||
def rescale_zero_terminal_snr(betas):
|
||||
def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
the betas that the scheduler is being initialized with.
|
||||
The betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -146,17 +147,17 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
The starting `beta` value of inference.
|
||||
beta_end (`float`, defaults to 0.02):
|
||||
The final `beta` value.
|
||||
beta_schedule (`str`, defaults to `"linear"`):
|
||||
beta_schedule (`Literal["linear", "scaled_linear", "squaredcos_cap_v2"]`, defaults to `"linear"`):
|
||||
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
|
||||
`linear` or `scaled_linear`.
|
||||
`"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`.
|
||||
trained_betas (`np.ndarray`, *optional*):
|
||||
Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
|
||||
prediction_type (`str`, defaults to `epsilon`, *optional*):
|
||||
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
|
||||
`sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
|
||||
prediction_type (`Literal["epsilon", "sample", "v_prediction"]`, defaults to `"epsilon"`, *optional*):
|
||||
Prediction type of the scheduler function; can be `"epsilon"` (predicts the noise of the diffusion
|
||||
process), `"sample"` (directly predicts the noisy sample) or `"v_prediction"` (see section 2.4 of [Imagen
|
||||
Video](https://imagen.research.google/video/paper.pdf) paper).
|
||||
interpolation_type(`str`, defaults to `"linear"`, *optional*):
|
||||
The interpolation type to compute intermediate sigmas for the scheduler denoising steps. Should be on of
|
||||
interpolation_type (`Literal["linear", "log_linear"]`, defaults to `"linear"`, *optional*):
|
||||
The interpolation type to compute intermediate sigmas for the scheduler denoising steps. Should be one of
|
||||
`"linear"` or `"log_linear"`.
|
||||
use_karras_sigmas (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
|
||||
@@ -166,18 +167,26 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
use_beta_sigmas (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use beta sigmas for step sizes in the noise schedule during the sampling process. Refer to [Beta
|
||||
Sampling is All You Need](https://huggingface.co/papers/2407.12173) for more information.
|
||||
timestep_spacing (`str`, defaults to `"linspace"`):
|
||||
sigma_min (`float`, *optional*):
|
||||
The minimum sigma value for the noise schedule. If not provided, defaults to the last sigma in the
|
||||
schedule.
|
||||
sigma_max (`float`, *optional*):
|
||||
The maximum sigma value for the noise schedule. If not provided, defaults to the first sigma in the
|
||||
schedule.
|
||||
timestep_spacing (`Literal["linspace", "leading", "trailing"]`, defaults to `"linspace"`):
|
||||
The way the timesteps should be scaled. Refer to Table 2 of [Common Diffusion Noise Schedules and
|
||||
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
|
||||
timestep_type (`Literal["discrete", "continuous"]`, defaults to `"discrete"`):
|
||||
The type of timesteps to use. Can be `"discrete"` or `"continuous"`.
|
||||
steps_offset (`int`, defaults to 0):
|
||||
An offset added to the inference steps, as required by some model families.
|
||||
rescale_betas_zero_snr (`bool`, defaults to `False`):
|
||||
Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
|
||||
dark samples instead of limiting it to samples with medium brightness. Loosely related to
|
||||
[`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
|
||||
final_sigmas_type (`str`, defaults to `"zero"`):
|
||||
final_sigmas_type (`Literal["zero", "sigma_min"]`, defaults to `"zero"`):
|
||||
The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
|
||||
sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
|
||||
sigma is the same as the last sigma in the training schedule. If `"zero"`, the final sigma is set to 0.
|
||||
"""
|
||||
|
||||
_compatibles = [e.name for e in KarrasDiffusionSchedulers]
|
||||
@@ -189,20 +198,20 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
num_train_timesteps: int = 1000,
|
||||
beta_start: float = 0.0001,
|
||||
beta_end: float = 0.02,
|
||||
beta_schedule: str = "linear",
|
||||
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
|
||||
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
|
||||
prediction_type: str = "epsilon",
|
||||
interpolation_type: str = "linear",
|
||||
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
|
||||
interpolation_type: Literal["linear", "log_linear"] = "linear",
|
||||
use_karras_sigmas: Optional[bool] = False,
|
||||
use_exponential_sigmas: Optional[bool] = False,
|
||||
use_beta_sigmas: Optional[bool] = False,
|
||||
sigma_min: Optional[float] = None,
|
||||
sigma_max: Optional[float] = None,
|
||||
timestep_spacing: str = "linspace",
|
||||
timestep_type: str = "discrete", # can be "discrete" or "continuous"
|
||||
timestep_spacing: Literal["linspace", "leading", "trailing"] = "linspace",
|
||||
timestep_type: Literal["discrete", "continuous"] = "discrete",
|
||||
steps_offset: int = 0,
|
||||
rescale_betas_zero_snr: bool = False,
|
||||
final_sigmas_type: str = "zero", # can be "zero" or "sigma_min"
|
||||
final_sigmas_type: Literal["zero", "sigma_min"] = "zero",
|
||||
):
|
||||
if self.config.use_beta_sigmas and not is_scipy_available():
|
||||
raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
|
||||
@@ -259,8 +268,15 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
|
||||
|
||||
@property
|
||||
def init_noise_sigma(self):
|
||||
# standard deviation of the initial noise distribution
|
||||
def init_noise_sigma(self) -> Union[float, torch.Tensor]:
|
||||
"""
|
||||
The standard deviation of the initial noise distribution.
|
||||
|
||||
Returns:
|
||||
`float` or `torch.Tensor`:
|
||||
The standard deviation of the initial noise distribution, computed based on the maximum sigma value and
|
||||
the timestep spacing configuration.
|
||||
"""
|
||||
max_sigma = max(self.sigmas) if isinstance(self.sigmas, list) else self.sigmas.max()
|
||||
if self.config.timestep_spacing in ["linspace", "trailing"]:
|
||||
return max_sigma
|
||||
@@ -268,26 +284,34 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return (max_sigma**2 + 1) ** 0.5
|
||||
|
||||
@property
|
||||
def step_index(self):
|
||||
def step_index(self) -> Optional[int]:
|
||||
"""
|
||||
The index counter for current timestep. It will increase 1 after each scheduler step.
|
||||
The index counter for current timestep. It will increase by 1 after each scheduler step.
|
||||
|
||||
Returns:
|
||||
`int` or `None`:
|
||||
The current step index, or `None` if not initialized.
|
||||
"""
|
||||
return self._step_index
|
||||
|
||||
@property
|
||||
def begin_index(self):
|
||||
def begin_index(self) -> Optional[int]:
|
||||
"""
|
||||
The index for the first timestep. It should be set from the pipeline with the `set_begin_index` method.
|
||||
|
||||
Returns:
|
||||
`int` or `None`:
|
||||
The begin index for the scheduler, or `None` if not set.
|
||||
"""
|
||||
return self._begin_index
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
|
||||
def set_begin_index(self, begin_index: int = 0):
|
||||
def set_begin_index(self, begin_index: int = 0) -> None:
|
||||
"""
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -299,13 +323,13 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
timestep (`int`, *optional*):
|
||||
The input sample to be scaled.
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep in the diffusion chain.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
A scaled input sample.
|
||||
A scaled input sample, divided by `(sigma**2 + 1) ** 0.5`.
|
||||
"""
|
||||
if self.step_index is None:
|
||||
self._init_step_index(timestep)
|
||||
@@ -318,17 +342,18 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
def set_timesteps(
|
||||
self,
|
||||
num_inference_steps: int = None,
|
||||
device: Union[str, torch.device] = None,
|
||||
num_inference_steps: Optional[int] = None,
|
||||
device: Optional[Union[str, torch.device]] = None,
|
||||
timesteps: Optional[List[int]] = None,
|
||||
sigmas: Optional[List[float]] = None,
|
||||
):
|
||||
) -> None:
|
||||
"""
|
||||
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
|
||||
|
||||
Args:
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model.
|
||||
num_inference_steps (`int`, *optional*):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If `None`,
|
||||
`timesteps` or `sigmas` must be provided.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -336,10 +361,9 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps` and `sigmas`
|
||||
must be `None`, and `timestep_spacing` attribute will be ignored.
|
||||
sigmas (`List[float]`, *optional*):
|
||||
Custom sigmas used to support arbitrary timesteps schedule schedule. If `None`, timesteps and sigmas
|
||||
will be generated based on the relevant scheduler attributes. If `sigmas` is passed,
|
||||
`num_inference_steps` and `timesteps` must be `None`, and the timesteps will be generated based on the
|
||||
custom sigmas schedule.
|
||||
Custom sigmas used to support arbitrary timesteps schedule. If `None`, timesteps and sigmas will be
|
||||
generated based on the relevant scheduler attributes. If `sigmas` is passed, `num_inference_steps` and
|
||||
`timesteps` must be `None`, and the timesteps will be generated based on the custom sigmas schedule.
|
||||
"""
|
||||
|
||||
if timesteps is not None and sigmas is not None:
|
||||
@@ -449,7 +473,20 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
self._begin_index = None
|
||||
self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication
|
||||
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
def _sigma_to_t(self, sigma: np.ndarray, log_sigmas: np.ndarray) -> np.ndarray:
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -473,8 +510,21 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return t
|
||||
|
||||
# Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -500,7 +550,19 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L26
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""Constructs an exponential noise schedule."""
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -523,7 +585,24 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -551,7 +630,23 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
)
|
||||
return sigmas
|
||||
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -565,7 +660,14 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
return indices[pos].item()
|
||||
|
||||
def _init_step_index(self, timestep):
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -591,26 +693,33 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
Args:
|
||||
model_output (`torch.Tensor`):
|
||||
The direct output from learned diffusion model.
|
||||
timestep (`float`):
|
||||
The direct output from the learned diffusion model.
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current discrete timestep in the diffusion chain.
|
||||
sample (`torch.Tensor`):
|
||||
A current instance of a sample created by the diffusion process.
|
||||
s_churn (`float`):
|
||||
s_tmin (`float`):
|
||||
s_tmax (`float`):
|
||||
s_noise (`float`, defaults to 1.0):
|
||||
s_churn (`float`, *optional*, defaults to `0.0`):
|
||||
Stochasticity parameter that controls the amount of noise added during sampling. Higher values increase
|
||||
randomness.
|
||||
s_tmin (`float`, *optional*, defaults to `0.0`):
|
||||
Minimum timestep threshold for applying stochasticity. Only timesteps above this value will have noise
|
||||
added.
|
||||
s_tmax (`float`, *optional*, defaults to `inf`):
|
||||
Maximum timestep threshold for applying stochasticity. Only timesteps below this value will have noise
|
||||
added.
|
||||
s_noise (`float`, *optional*, defaults to `1.0`):
|
||||
Scaling factor for noise added to the sample.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator.
|
||||
return_dict (`bool`):
|
||||
A random number generator for reproducible sampling.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
|
||||
tuple.
|
||||
|
||||
Returns:
|
||||
[`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
|
||||
If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
|
||||
returned, otherwise a tuple is returned where the first element is the sample tensor.
|
||||
If `return_dict` is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
|
||||
returned, otherwise a tuple is returned where the first element is the sample tensor and the second
|
||||
element is the predicted original sample.
|
||||
"""
|
||||
|
||||
if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
|
||||
@@ -689,6 +798,21 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
@@ -717,6 +841,24 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return noisy_samples
|
||||
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction for the given sample and noise at the specified timesteps.
|
||||
|
||||
This method implements the velocity prediction used in v-prediction models, which predicts a linear combination
|
||||
of the sample and noise.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample for which to compute the velocity.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor corresponding to the sample.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to compute the velocity.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The velocity prediction computed as `sqrt(alpha_prod) * noise - sqrt(1 - alpha_prod) * sample`.
|
||||
"""
|
||||
if (
|
||||
isinstance(timesteps, int)
|
||||
or isinstance(timesteps, torch.IntTensor)
|
||||
@@ -753,5 +895,5 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
|
||||
return velocity
|
||||
|
||||
def __len__(self):
|
||||
def __len__(self) -> int:
|
||||
return self.config.num_train_timesteps
|
||||
|
||||
@@ -160,7 +160,7 @@ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -473,7 +473,20 @@ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -499,7 +512,19 @@ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""Constructs an exponential noise schedule."""
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -523,7 +548,24 @@ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
|
||||
@@ -102,7 +102,7 @@ class FlowMatchHeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
|
||||
@@ -168,7 +168,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -473,7 +473,20 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -499,7 +512,19 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""Constructs an exponential noise schedule."""
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -523,7 +548,24 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -49,10 +49,10 @@ class HeunDiscreteSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -187,7 +188,23 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -229,7 +246,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -354,6 +371,19 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -378,7 +408,20 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -404,7 +447,19 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""Constructs an exponential noise schedule."""
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -428,7 +483,24 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -461,7 +533,14 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return self.dt is None
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep):
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -579,6 +658,21 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -78,7 +78,7 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -112,7 +112,23 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
|
||||
self._begin_index = None
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -127,7 +143,14 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep):
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -50,10 +50,10 @@ class KDPM2AncestralDiscreteSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -61,16 +61,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -206,7 +207,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -342,6 +343,19 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -366,7 +380,20 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -392,7 +419,19 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""Constructs an exponential noise schedule."""
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -416,7 +455,24 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -449,7 +505,23 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return self.sample is None
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -464,7 +536,14 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep):
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -586,6 +665,21 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -49,10 +49,10 @@ class KDPM2DiscreteSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -60,16 +60,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -206,7 +207,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -330,7 +331,23 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return self.sample is None
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -345,7 +362,14 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep):
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -355,6 +379,19 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -379,7 +416,20 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -405,7 +455,19 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""Constructs an exponential noise schedule."""
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -429,7 +491,24 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -558,6 +637,21 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -51,10 +51,10 @@ class LCMSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -62,16 +62,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -99,13 +100,13 @@ def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR, based on https://huggingface.co/papers/2305.08891 (Algorithm 1).
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
the betas that the scheduler is being initialized with.
|
||||
The betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -251,7 +252,23 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
self._begin_index = None
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -266,7 +283,14 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep):
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -291,7 +315,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -315,6 +339,8 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -322,6 +348,14 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -597,6 +631,22 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -619,6 +669,21 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
@@ -642,6 +707,17 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
|
||||
def previous_timestep(self, timestep):
|
||||
"""
|
||||
Compute the previous timestep in the diffusion chain.
|
||||
|
||||
Args:
|
||||
timestep (`int`):
|
||||
The current timestep.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The previous timestep.
|
||||
"""
|
||||
if self.custom_timesteps or self.num_inference_steps:
|
||||
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
|
||||
if index == self.timesteps.shape[0] - 1:
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
import math
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import scipy.stats
|
||||
@@ -47,10 +47,10 @@ class LMSDiscreteSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -58,16 +58,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -209,7 +210,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -319,7 +320,23 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
self.derivatives = []
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -334,7 +351,14 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep):
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -344,6 +368,19 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -382,7 +419,19 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""Constructs an exponential noise schedule."""
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -406,7 +455,24 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -521,6 +587,21 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
|
||||
|
||||
import math
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -26,10 +26,10 @@ from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, Schedul
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -37,16 +37,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -452,6 +453,22 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple, Union
|
||||
from typing import Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -45,10 +45,10 @@ class RePaintSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -56,16 +56,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
|
||||
|
||||
import math
|
||||
from typing import Callable, List, Optional, Tuple, Union
|
||||
from typing import Callable, List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -33,10 +33,10 @@ if is_scipy_available():
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -44,16 +44,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -253,7 +254,7 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -342,6 +343,8 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -349,6 +352,14 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -375,6 +386,19 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -410,7 +434,20 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -436,7 +473,19 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""Constructs an exponential noise schedule."""
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -460,7 +509,24 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -1193,6 +1259,22 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
|
||||
@@ -109,7 +109,7 @@ class SCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -173,7 +173,14 @@ class SCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
self._begin_index = None
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep):
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -182,7 +189,23 @@ class SCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
self._step_index = self._begin_index
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -50,10 +50,10 @@ class TCDSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -61,16 +61,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -98,13 +99,13 @@ def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR, based on https://huggingface.co/papers/2305.08891 (Algorithm 1).
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
the betas that the scheduler is being initialized with.
|
||||
The betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -252,7 +253,23 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
self._begin_index = None
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -267,7 +284,14 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep):
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -292,7 +316,7 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from the pipeline before inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -316,6 +340,24 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler._get_variance
|
||||
def _get_variance(self, timestep, prev_timestep):
|
||||
"""
|
||||
Computes the variance of the noise added at a given diffusion step.
|
||||
|
||||
For a given `timestep` and its previous step, this method calculates the variance as defined in DDIM/DDPM
|
||||
literature:
|
||||
var_t = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
|
||||
where alpha_prod and beta_prod are cumulative products of alphas and betas, respectively.
|
||||
|
||||
Args:
|
||||
timestep (`int`):
|
||||
The current timestep in the diffusion process.
|
||||
prev_timestep (`int`):
|
||||
The previous timestep in the diffusion process. If negative, uses `final_alpha_cumprod`.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The variance for the current timestep.
|
||||
"""
|
||||
alpha_prod_t = self.alphas_cumprod[timestep]
|
||||
alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
|
||||
beta_prod_t = 1 - alpha_prod_t
|
||||
@@ -328,6 +370,8 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -335,6 +379,14 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -634,6 +686,22 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -656,6 +724,21 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
@@ -679,6 +762,17 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
|
||||
def previous_timestep(self, timestep):
|
||||
"""
|
||||
Compute the previous timestep in the diffusion chain.
|
||||
|
||||
Args:
|
||||
timestep (`int`):
|
||||
The current timestep.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The previous timestep.
|
||||
"""
|
||||
if self.custom_timesteps or self.num_inference_steps:
|
||||
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
|
||||
if index == self.timesteps.shape[0] - 1:
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, Tuple, Union
|
||||
from typing import Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -46,10 +46,10 @@ class UnCLIPSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -57,16 +57,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -334,6 +335,22 @@ class UnCLIPScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
|
||||
|
||||
import math
|
||||
from typing import List, Optional, Tuple, Union
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -32,10 +32,10 @@ if is_scipy_available():
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -43,16 +43,17 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
|
||||
Returns:
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -80,13 +81,13 @@ def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
the betas that the scheduler is being initialized with.
|
||||
The betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -297,7 +298,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`):
|
||||
begin_index (`int`, defaults to `0`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -432,6 +433,8 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -439,6 +442,14 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -465,6 +476,19 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -500,7 +524,20 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -526,7 +563,19 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""Constructs an exponential noise schedule."""
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -550,7 +599,24 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
|
||||
@@ -108,6 +108,7 @@ from .import_utils import (
|
||||
is_tensorboard_available,
|
||||
is_timm_available,
|
||||
is_torch_available,
|
||||
is_torch_mlu_available,
|
||||
is_torch_npu_available,
|
||||
is_torch_version,
|
||||
is_torch_xla_available,
|
||||
|
||||
@@ -42,7 +42,7 @@ HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(HF_HOME, "modules"
|
||||
DEPRECATED_REVISION_ARGS = ["fp16", "non-ema"]
|
||||
DIFFUSERS_REQUEST_TIMEOUT = 60
|
||||
DIFFUSERS_ATTN_BACKEND = os.getenv("DIFFUSERS_ATTN_BACKEND", "native")
|
||||
DIFFUSERS_ATTN_CHECKS = os.getenv("DIFFUSERS_ATTN_CHECKS", "0") in ENV_VARS_TRUE_VALUES
|
||||
DIFFUSERS_ATTN_CHECKS = os.getenv("DIFFUSERS_ATTN_CHECKS", "0").upper() in ENV_VARS_TRUE_VALUES
|
||||
DEFAULT_HF_PARALLEL_LOADING_WORKERS = 8
|
||||
HF_ENABLE_PARALLEL_LOADING = os.environ.get("HF_ENABLE_PARALLEL_LOADING", "").upper() in ENV_VARS_TRUE_VALUES
|
||||
DIFFUSERS_DISABLE_REMOTE_CODE = os.getenv("DIFFUSERS_DISABLE_REMOTE_CODE", "false").upper() in ENV_VARS_TRUE_VALUES
|
||||
|
||||
@@ -1623,6 +1623,21 @@ class VQModel(metaclass=DummyObject):
|
||||
requires_backends(cls, ["torch"])
|
||||
|
||||
|
||||
class WanAnimateTransformer3DModel(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch"])
|
||||
|
||||
|
||||
class WanTransformer3DModel(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
|
||||
@@ -3512,6 +3512,21 @@ class VQDiffusionPipeline(metaclass=DummyObject):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class WanAnimatePipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class WanImageToVideoPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
|
||||
@@ -192,6 +192,7 @@ except importlib_metadata.PackageNotFoundError:
|
||||
|
||||
_torch_xla_available, _torch_xla_version = _is_package_available("torch_xla")
|
||||
_torch_npu_available, _torch_npu_version = _is_package_available("torch_npu")
|
||||
_torch_mlu_available, _torch_mlu_version = _is_package_available("torch_mlu")
|
||||
_transformers_available, _transformers_version = _is_package_available("transformers")
|
||||
_hf_hub_available, _hf_hub_version = _is_package_available("huggingface_hub")
|
||||
_kernels_available, _kernels_version = _is_package_available("kernels")
|
||||
@@ -243,6 +244,10 @@ def is_torch_npu_available():
|
||||
return _torch_npu_available
|
||||
|
||||
|
||||
def is_torch_mlu_available():
|
||||
return _torch_mlu_available
|
||||
|
||||
|
||||
def is_flax_available():
|
||||
return _flax_available
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ import os
|
||||
from typing import Callable, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from . import logging
|
||||
from .import_utils import is_torch_available, is_torch_npu_available, is_torch_version
|
||||
from .import_utils import is_torch_available, is_torch_mlu_available, is_torch_npu_available, is_torch_version
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
@@ -242,8 +242,8 @@ def fourier_filter(x_in: "torch.Tensor", threshold: int, scale: int) -> "torch.T
|
||||
def apply_freeu(
|
||||
resolution_idx: int, hidden_states: "torch.Tensor", res_hidden_states: "torch.Tensor", **freeu_kwargs
|
||||
) -> Tuple["torch.Tensor", "torch.Tensor"]:
|
||||
"""Applies the FreeU mechanism as introduced in https:
|
||||
//arxiv.org/abs/2309.11497. Adapted from the official code repository: https://github.com/ChenyangSi/FreeU.
|
||||
"""Applies the FreeU mechanism as introduced in https://huggingface.co/papers/2309.11497. Adapted from the official
|
||||
code repository: https://github.com/ChenyangSi/FreeU.
|
||||
|
||||
Args:
|
||||
resolution_idx (`int`): Integer denoting the UNet block where FreeU is being applied.
|
||||
@@ -286,6 +286,8 @@ def get_device():
|
||||
return "xpu"
|
||||
elif torch.backends.mps.is_available():
|
||||
return "mps"
|
||||
elif is_torch_mlu_available():
|
||||
return "mlu"
|
||||
else:
|
||||
return "cpu"
|
||||
|
||||
|
||||
@@ -82,3 +82,7 @@ class AutoencoderDCTests(ModelTesterMixin, AutoencoderTesterMixin, unittest.Test
|
||||
@unittest.skipIf(IS_GITHUB_ACTIONS, reason="Skipping test inside GitHub Actions environment")
|
||||
def test_layerwise_casting_inference(self):
|
||||
super().test_layerwise_casting_inference()
|
||||
|
||||
@unittest.skipIf(IS_GITHUB_ACTIONS, reason="Skipping test inside GitHub Actions environment")
|
||||
def test_layerwise_casting_memory(self):
|
||||
super().test_layerwise_casting_memory()
|
||||
|
||||
@@ -0,0 +1,126 @@
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
|
||||
from diffusers import WanAnimateTransformer3DModel
|
||||
|
||||
from ...testing_utils import (
|
||||
enable_full_determinism,
|
||||
torch_device,
|
||||
)
|
||||
from ..test_modeling_common import ModelTesterMixin, TorchCompileTesterMixin
|
||||
|
||||
|
||||
enable_full_determinism()
|
||||
|
||||
|
||||
class WanAnimateTransformer3DTests(ModelTesterMixin, unittest.TestCase):
|
||||
model_class = WanAnimateTransformer3DModel
|
||||
main_input_name = "hidden_states"
|
||||
uses_custom_attn_processor = True
|
||||
|
||||
@property
|
||||
def dummy_input(self):
|
||||
batch_size = 1
|
||||
num_channels = 4
|
||||
num_frames = 20 # To make the shapes work out; for complicated reasons we want 21 to divide num_frames + 1
|
||||
height = 16
|
||||
width = 16
|
||||
text_encoder_embedding_dim = 16
|
||||
sequence_length = 12
|
||||
|
||||
clip_seq_len = 12
|
||||
clip_dim = 16
|
||||
|
||||
inference_segment_length = 77 # The inference segment length in the full Wan2.2-Animate-14B model
|
||||
face_height = 16 # Should be square and match `motion_encoder_size` below
|
||||
face_width = 16
|
||||
|
||||
hidden_states = torch.randn((batch_size, 2 * num_channels + 4, num_frames + 1, height, width)).to(torch_device)
|
||||
timestep = torch.randint(0, 1000, size=(batch_size,)).to(torch_device)
|
||||
encoder_hidden_states = torch.randn((batch_size, sequence_length, text_encoder_embedding_dim)).to(torch_device)
|
||||
clip_ref_features = torch.randn((batch_size, clip_seq_len, clip_dim)).to(torch_device)
|
||||
pose_latents = torch.randn((batch_size, num_channels, num_frames, height, width)).to(torch_device)
|
||||
face_pixel_values = torch.randn((batch_size, 3, inference_segment_length, face_height, face_width)).to(
|
||||
torch_device
|
||||
)
|
||||
|
||||
return {
|
||||
"hidden_states": hidden_states,
|
||||
"timestep": timestep,
|
||||
"encoder_hidden_states": encoder_hidden_states,
|
||||
"encoder_hidden_states_image": clip_ref_features,
|
||||
"pose_hidden_states": pose_latents,
|
||||
"face_pixel_values": face_pixel_values,
|
||||
}
|
||||
|
||||
@property
|
||||
def input_shape(self):
|
||||
return (12, 1, 16, 16)
|
||||
|
||||
@property
|
||||
def output_shape(self):
|
||||
return (4, 1, 16, 16)
|
||||
|
||||
def prepare_init_args_and_inputs_for_common(self):
|
||||
# Use custom channel sizes since the default Wan Animate channel sizes will cause the motion encoder to
|
||||
# contain the vast majority of the parameters in the test model
|
||||
channel_sizes = {"4": 16, "8": 16, "16": 16}
|
||||
|
||||
init_dict = {
|
||||
"patch_size": (1, 2, 2),
|
||||
"num_attention_heads": 2,
|
||||
"attention_head_dim": 12,
|
||||
"in_channels": 12, # 2 * C + 4 = 2 * 4 + 4 = 12
|
||||
"latent_channels": 4,
|
||||
"out_channels": 4,
|
||||
"text_dim": 16,
|
||||
"freq_dim": 256,
|
||||
"ffn_dim": 32,
|
||||
"num_layers": 2,
|
||||
"cross_attn_norm": True,
|
||||
"qk_norm": "rms_norm_across_heads",
|
||||
"image_dim": 16,
|
||||
"rope_max_seq_len": 32,
|
||||
"motion_encoder_channel_sizes": channel_sizes, # Start of Wan Animate-specific config
|
||||
"motion_encoder_size": 16, # Ensures that there will be 2 motion encoder resblocks
|
||||
"motion_style_dim": 8,
|
||||
"motion_dim": 4,
|
||||
"motion_encoder_dim": 16,
|
||||
"face_encoder_hidden_dim": 16,
|
||||
"face_encoder_num_heads": 2,
|
||||
"inject_face_latents_blocks": 2,
|
||||
}
|
||||
inputs_dict = self.dummy_input
|
||||
return init_dict, inputs_dict
|
||||
|
||||
def test_gradient_checkpointing_is_applied(self):
|
||||
expected_set = {"WanAnimateTransformer3DModel"}
|
||||
super().test_gradient_checkpointing_is_applied(expected_set=expected_set)
|
||||
|
||||
# Override test_output because the transformer output is expected to have fewer channels than the main transformer
|
||||
# input.
|
||||
def test_output(self):
|
||||
expected_output_shape = (1, 4, 21, 16, 16)
|
||||
super().test_output(expected_output_shape=expected_output_shape)
|
||||
|
||||
|
||||
class WanAnimateTransformerCompileTests(TorchCompileTesterMixin, unittest.TestCase):
|
||||
model_class = WanAnimateTransformer3DModel
|
||||
|
||||
def prepare_init_args_and_inputs_for_common(self):
|
||||
return WanAnimateTransformer3DTests().prepare_init_args_and_inputs_for_common()
|
||||
@@ -55,6 +55,9 @@ class TestFluxModularPipelineFast(ModularPipelineTesterMixin):
|
||||
}
|
||||
return inputs
|
||||
|
||||
def test_float16_inference(self):
|
||||
super().test_float16_inference(9e-2)
|
||||
|
||||
|
||||
class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin):
|
||||
pipeline_class = FluxModularPipeline
|
||||
@@ -118,6 +121,9 @@ class TestFluxImg2ImgModularPipelineFast(ModularPipelineTesterMixin):
|
||||
|
||||
assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3
|
||||
|
||||
def test_float16_inference(self):
|
||||
super().test_float16_inference(8e-2)
|
||||
|
||||
|
||||
class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin):
|
||||
pipeline_class = FluxKontextModularPipeline
|
||||
@@ -170,3 +176,6 @@ class TestFluxKontextModularPipelineFast(ModularPipelineTesterMixin):
|
||||
image_slices.append(image[0, -3:, -3:, -1].flatten())
|
||||
|
||||
assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3
|
||||
|
||||
def test_float16_inference(self):
|
||||
super().test_float16_inference(9e-2)
|
||||
|
||||
@@ -0,0 +1,120 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
|
||||
import PIL
|
||||
import pytest
|
||||
|
||||
from diffusers.modular_pipelines import (
|
||||
QwenImageAutoBlocks,
|
||||
QwenImageEditAutoBlocks,
|
||||
QwenImageEditModularPipeline,
|
||||
QwenImageEditPlusAutoBlocks,
|
||||
QwenImageEditPlusModularPipeline,
|
||||
QwenImageModularPipeline,
|
||||
)
|
||||
|
||||
from ..test_modular_pipelines_common import ModularGuiderTesterMixin, ModularPipelineTesterMixin
|
||||
|
||||
|
||||
class TestQwenImageModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin):
|
||||
pipeline_class = QwenImageModularPipeline
|
||||
pipeline_blocks_class = QwenImageAutoBlocks
|
||||
repo = "hf-internal-testing/tiny-qwenimage-modular"
|
||||
|
||||
params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image", "mask_image"])
|
||||
batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"])
|
||||
|
||||
def get_dummy_inputs(self):
|
||||
generator = self.get_generator()
|
||||
inputs = {
|
||||
"prompt": "dance monkey",
|
||||
"negative_prompt": "bad quality",
|
||||
"generator": generator,
|
||||
"num_inference_steps": 2,
|
||||
"height": 32,
|
||||
"width": 32,
|
||||
"max_sequence_length": 16,
|
||||
"output_type": "pt",
|
||||
}
|
||||
return inputs
|
||||
|
||||
def test_inference_batch_single_identical(self):
|
||||
super().test_inference_batch_single_identical(expected_max_diff=5e-4)
|
||||
|
||||
|
||||
class TestQwenImageEditModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin):
|
||||
pipeline_class = QwenImageEditModularPipeline
|
||||
pipeline_blocks_class = QwenImageEditAutoBlocks
|
||||
repo = "hf-internal-testing/tiny-qwenimage-edit-modular"
|
||||
|
||||
params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image", "mask_image"])
|
||||
batch_params = frozenset(["prompt", "negative_prompt", "image", "mask_image"])
|
||||
|
||||
def get_dummy_inputs(self):
|
||||
generator = self.get_generator()
|
||||
inputs = {
|
||||
"prompt": "dance monkey",
|
||||
"negative_prompt": "bad quality",
|
||||
"generator": generator,
|
||||
"num_inference_steps": 2,
|
||||
"height": 32,
|
||||
"width": 32,
|
||||
"output_type": "pt",
|
||||
}
|
||||
inputs["image"] = PIL.Image.new("RGB", (32, 32), 0)
|
||||
return inputs
|
||||
|
||||
def test_guider_cfg(self):
|
||||
super().test_guider_cfg(7e-5)
|
||||
|
||||
|
||||
class TestQwenImageEditPlusModularPipelineFast(ModularPipelineTesterMixin, ModularGuiderTesterMixin):
    """Fast tests for `QwenImageEditPlusModularPipeline` built from `QwenImageEditPlusAutoBlocks`."""

    pipeline_class = QwenImageEditPlusModularPipeline
    pipeline_blocks_class = QwenImageEditPlusAutoBlocks
    repo = "hf-internal-testing/tiny-qwenimage-edit-plus-modular"

    # No `mask_image` yet.
    params = frozenset(["prompt", "height", "width", "negative_prompt", "attention_kwargs", "image"])
    batch_params = frozenset(["prompt", "negative_prompt", "image"])

    def get_dummy_inputs(self):
        """Return the keyword arguments used to call the pipeline in the tests."""
        generator = self.get_generator()
        inputs = {
            "prompt": "dance monkey",
            "negative_prompt": "bad quality",
            "generator": generator,
            "num_inference_steps": 2,
            "height": 32,
            "width": 32,
            "output_type": "pt",
        }
        inputs["image"] = PIL.Image.new("RGB", (32, 32), 0)
        return inputs

    @pytest.mark.xfail(condition=True, reason="Batch of multiple images needs to be revisited", strict=True)
    def test_num_images_per_prompt(self):
        super().test_num_images_per_prompt()

    # NOTE: the overrides below must accept `self`. Without it pytest's bound-method call raises
    # `TypeError` before the parent test runs, and the strict xfail marker would report that
    # error as the "expected" failure instead of the real batching problem.
    @pytest.mark.xfail(condition=True, reason="Batch of multiple images needs to be revisited", strict=True)
    def test_inference_batch_consistent(self):
        super().test_inference_batch_consistent()

    @pytest.mark.xfail(condition=True, reason="Batch of multiple images needs to be revisited", strict=True)
    def test_inference_batch_single_identical(self):
        super().test_inference_batch_single_identical()

    def test_guider_cfg(self):
        super().test_guider_cfg(1e-3)
|
||||
+11
-65
@@ -25,7 +25,7 @@ from diffusers.loaders import ModularIPAdapterMixin
|
||||
|
||||
from ...models.unets.test_models_unet_2d_condition import create_ip_adapter_state_dict
|
||||
from ...testing_utils import enable_full_determinism, floats_tensor, torch_device
|
||||
from ..test_modular_pipelines_common import ModularPipelineTesterMixin
|
||||
from ..test_modular_pipelines_common import ModularGuiderTesterMixin, ModularPipelineTesterMixin
|
||||
|
||||
|
||||
enable_full_determinism()
|
||||
@@ -37,13 +37,11 @@ class SDXLModularTesterMixin:
|
||||
"""
|
||||
|
||||
def _test_stable_diffusion_xl_euler(self, expected_image_shape, expected_slice, expected_max_diff=1e-2):
|
||||
sd_pipe = self.get_pipeline()
|
||||
sd_pipe = sd_pipe.to(torch_device)
|
||||
sd_pipe.set_progress_bar_config(disable=None)
|
||||
sd_pipe = self.get_pipeline().to(torch_device)
|
||||
|
||||
inputs = self.get_dummy_inputs()
|
||||
image = sd_pipe(**inputs, output="images")
|
||||
image_slice = image[0, -3:, -3:, -1]
|
||||
image_slice = image[0, -3:, -3:, -1].cpu()
|
||||
|
||||
assert image.shape == expected_image_shape
|
||||
max_diff = torch.abs(image_slice.flatten() - expected_slice).max()
|
||||
@@ -110,7 +108,7 @@ class SDXLModularIPAdapterTesterMixin:
|
||||
pipe = blocks.init_pipeline(self.repo)
|
||||
pipe.load_components(torch_dtype=torch.float32)
|
||||
pipe = pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
cross_attention_dim = pipe.unet.config.get("cross_attention_dim")
|
||||
|
||||
# forward pass without ip adapter
|
||||
@@ -219,9 +217,7 @@ class SDXLModularControlNetTesterMixin:
|
||||
# compare against static slices and that can be shaky (with a VVVV low probability).
|
||||
expected_max_diff = 9e-4 if torch_device == "cpu" else expected_max_diff
|
||||
|
||||
pipe = self.get_pipeline()
|
||||
pipe = pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe = self.get_pipeline().to(torch_device)
|
||||
|
||||
# forward pass without controlnet
|
||||
inputs = self.get_dummy_inputs()
|
||||
@@ -251,9 +247,7 @@ class SDXLModularControlNetTesterMixin:
|
||||
assert max_diff_with_controlnet_scale > 1e-2, "Output with controlnet must be different from normal inference"
|
||||
|
||||
def test_controlnet_cfg(self):
|
||||
pipe = self.get_pipeline()
|
||||
pipe = pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe = self.get_pipeline().to(torch_device)
|
||||
|
||||
# forward pass with CFG not applied
|
||||
guider = ClassifierFreeGuidance(guidance_scale=1.0)
|
||||
@@ -273,35 +267,11 @@ class SDXLModularControlNetTesterMixin:
|
||||
assert max_diff > 1e-2, "Output with CFG must be different from normal inference"
|
||||
|
||||
|
||||
class SDXLModularGuiderTesterMixin:
|
||||
def test_guider_cfg(self):
|
||||
pipe = self.get_pipeline()
|
||||
pipe = pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
# forward pass with CFG not applied
|
||||
guider = ClassifierFreeGuidance(guidance_scale=1.0)
|
||||
pipe.update_components(guider=guider)
|
||||
|
||||
inputs = self.get_dummy_inputs()
|
||||
out_no_cfg = pipe(**inputs, output="images")
|
||||
|
||||
# forward pass with CFG applied
|
||||
guider = ClassifierFreeGuidance(guidance_scale=7.5)
|
||||
pipe.update_components(guider=guider)
|
||||
inputs = self.get_dummy_inputs()
|
||||
out_cfg = pipe(**inputs, output="images")
|
||||
|
||||
assert out_cfg.shape == out_no_cfg.shape
|
||||
max_diff = np.abs(out_cfg - out_no_cfg).max()
|
||||
assert max_diff > 1e-2, "Output with CFG must be different from normal inference"
|
||||
|
||||
|
||||
class TestSDXLModularPipelineFast(
|
||||
SDXLModularTesterMixin,
|
||||
SDXLModularIPAdapterTesterMixin,
|
||||
SDXLModularControlNetTesterMixin,
|
||||
SDXLModularGuiderTesterMixin,
|
||||
ModularGuiderTesterMixin,
|
||||
ModularPipelineTesterMixin,
|
||||
):
|
||||
"""Test cases for Stable Diffusion XL modular pipeline fast tests."""
|
||||
@@ -335,18 +305,7 @@ class TestSDXLModularPipelineFast(
|
||||
self._test_stable_diffusion_xl_euler(
|
||||
expected_image_shape=self.expected_image_output_shape,
|
||||
expected_slice=torch.tensor(
|
||||
[
|
||||
0.5966781,
|
||||
0.62939394,
|
||||
0.48465094,
|
||||
0.51573336,
|
||||
0.57593524,
|
||||
0.47035995,
|
||||
0.53410417,
|
||||
0.51436996,
|
||||
0.47313565,
|
||||
],
|
||||
device=torch_device,
|
||||
[0.3886, 0.4685, 0.4953, 0.4217, 0.4317, 0.3945, 0.4847, 0.4704, 0.4731],
|
||||
),
|
||||
expected_max_diff=1e-2,
|
||||
)
|
||||
@@ -359,7 +318,7 @@ class TestSDXLImg2ImgModularPipelineFast(
|
||||
SDXLModularTesterMixin,
|
||||
SDXLModularIPAdapterTesterMixin,
|
||||
SDXLModularControlNetTesterMixin,
|
||||
SDXLModularGuiderTesterMixin,
|
||||
ModularGuiderTesterMixin,
|
||||
ModularPipelineTesterMixin,
|
||||
):
|
||||
"""Test cases for Stable Diffusion XL image-to-image modular pipeline fast tests."""
|
||||
@@ -400,20 +359,7 @@ class TestSDXLImg2ImgModularPipelineFast(
|
||||
def test_stable_diffusion_xl_euler(self):
|
||||
self._test_stable_diffusion_xl_euler(
|
||||
expected_image_shape=self.expected_image_output_shape,
|
||||
expected_slice=torch.tensor(
|
||||
[
|
||||
0.56943184,
|
||||
0.4702148,
|
||||
0.48048905,
|
||||
0.6235963,
|
||||
0.551138,
|
||||
0.49629188,
|
||||
0.60031277,
|
||||
0.5688907,
|
||||
0.43996853,
|
||||
],
|
||||
device=torch_device,
|
||||
),
|
||||
expected_slice=torch.tensor([0.5246, 0.4466, 0.444, 0.3246, 0.4443, 0.5108, 0.5225, 0.559, 0.5147]),
|
||||
expected_max_diff=1e-2,
|
||||
)
|
||||
|
||||
@@ -425,7 +371,7 @@ class SDXLInpaintingModularPipelineFastTests(
|
||||
SDXLModularTesterMixin,
|
||||
SDXLModularIPAdapterTesterMixin,
|
||||
SDXLModularControlNetTesterMixin,
|
||||
SDXLModularGuiderTesterMixin,
|
||||
ModularGuiderTesterMixin,
|
||||
ModularPipelineTesterMixin,
|
||||
):
|
||||
"""Test cases for Stable Diffusion XL inpainting modular pipeline fast tests."""
|
||||
|
||||
@@ -2,22 +2,17 @@ import gc
|
||||
import tempfile
|
||||
from typing import Callable, Union
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import diffusers
|
||||
from diffusers import ComponentsManager, ModularPipeline, ModularPipelineBlocks
|
||||
from diffusers.guiders import ClassifierFreeGuidance
|
||||
from diffusers.utils import logging
|
||||
|
||||
from ..testing_utils import (
|
||||
backend_empty_cache,
|
||||
numpy_cosine_similarity_distance,
|
||||
require_accelerator,
|
||||
require_torch,
|
||||
torch_device,
|
||||
)
|
||||
from ..testing_utils import backend_empty_cache, numpy_cosine_similarity_distance, require_accelerator, torch_device
|
||||
|
||||
|
||||
@require_torch
|
||||
class ModularPipelineTesterMixin:
|
||||
"""
|
||||
It provides a set of common tests for each modular pipeline,
|
||||
@@ -32,20 +27,9 @@ class ModularPipelineTesterMixin:
|
||||
# Canonical parameters that are passed to `__call__` regardless
|
||||
# of the type of pipeline. They are always optional and have common
|
||||
# sense default values.
|
||||
optional_params = frozenset(
|
||||
[
|
||||
"num_inference_steps",
|
||||
"num_images_per_prompt",
|
||||
"latents",
|
||||
"output_type",
|
||||
]
|
||||
)
|
||||
optional_params = frozenset(["num_inference_steps", "num_images_per_prompt", "latents", "output_type"])
|
||||
# this is modular specific: generator needs to be an intermediate input because it's mutable
|
||||
intermediate_params = frozenset(
|
||||
[
|
||||
"generator",
|
||||
]
|
||||
)
|
||||
intermediate_params = frozenset(["generator"])
|
||||
|
||||
def get_generator(self, seed=0):
|
||||
generator = torch.Generator("cpu").manual_seed(seed)
|
||||
@@ -121,6 +105,7 @@ class ModularPipelineTesterMixin:
|
||||
def get_pipeline(self, components_manager=None, torch_dtype=torch.float32):
|
||||
pipeline = self.pipeline_blocks_class().init_pipeline(self.repo, components_manager=components_manager)
|
||||
pipeline.load_components(torch_dtype=torch_dtype)
|
||||
pipeline.set_progress_bar_config(disable=None)
|
||||
return pipeline
|
||||
|
||||
def test_pipeline_call_signature(self):
|
||||
@@ -138,9 +123,7 @@ class ModularPipelineTesterMixin:
|
||||
_check_for_parameters(self.optional_params, optional_parameters, "optional")
|
||||
|
||||
def test_inference_batch_consistent(self, batch_sizes=[2], batch_generator=True):
|
||||
pipe = self.get_pipeline()
|
||||
pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe = self.get_pipeline().to(torch_device)
|
||||
|
||||
inputs = self.get_dummy_inputs()
|
||||
inputs["generator"] = self.get_generator(0)
|
||||
@@ -179,9 +162,8 @@ class ModularPipelineTesterMixin:
|
||||
batch_size=2,
|
||||
expected_max_diff=1e-4,
|
||||
):
|
||||
pipe = self.get_pipeline()
|
||||
pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe = self.get_pipeline().to(torch_device)
|
||||
|
||||
inputs = self.get_dummy_inputs()
|
||||
|
||||
# Reset generator in case it has been used in self.get_dummy_inputs
|
||||
@@ -219,11 +201,9 @@ class ModularPipelineTesterMixin:
|
||||
def test_float16_inference(self, expected_max_diff=5e-2):
|
||||
pipe = self.get_pipeline()
|
||||
pipe.to(torch_device, torch.float32)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
|
||||
pipe_fp16 = self.get_pipeline()
|
||||
pipe_fp16.to(torch_device, torch.float16)
|
||||
pipe_fp16.set_progress_bar_config(disable=None)
|
||||
|
||||
inputs = self.get_dummy_inputs()
|
||||
# Reset generator in case it is used inside dummy inputs
|
||||
@@ -237,19 +217,16 @@ class ModularPipelineTesterMixin:
|
||||
fp16_inputs["generator"] = self.get_generator(0)
|
||||
output_fp16 = pipe_fp16(**fp16_inputs, output="images")
|
||||
|
||||
if isinstance(output, torch.Tensor):
|
||||
output = output.cpu()
|
||||
output_fp16 = output_fp16.cpu()
|
||||
output = output.cpu()
|
||||
output_fp16 = output_fp16.cpu()
|
||||
|
||||
max_diff = numpy_cosine_similarity_distance(output.flatten(), output_fp16.flatten())
|
||||
assert max_diff < expected_max_diff, "FP16 inference is different from FP32 inference"
|
||||
|
||||
@require_accelerator
|
||||
def test_to_device(self):
|
||||
pipe = self.get_pipeline()
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe = self.get_pipeline().to("cpu")
|
||||
|
||||
pipe.to("cpu")
|
||||
model_devices = [
|
||||
component.device.type for component in pipe.components.values() if hasattr(component, "device")
|
||||
]
|
||||
@@ -264,30 +241,23 @@ class ModularPipelineTesterMixin:
|
||||
)
|
||||
|
||||
def test_inference_is_not_nan_cpu(self):
|
||||
pipe = self.get_pipeline()
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to("cpu")
|
||||
pipe = self.get_pipeline().to("cpu")
|
||||
|
||||
output = pipe(**self.get_dummy_inputs(), output="images")
|
||||
assert torch.isnan(output).sum() == 0, "CPU Inference returns NaN"
|
||||
|
||||
@require_accelerator
|
||||
def test_inference_is_not_nan(self):
|
||||
pipe = self.get_pipeline()
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pipe.to(torch_device)
|
||||
pipe = self.get_pipeline().to(torch_device)
|
||||
|
||||
output = pipe(**self.get_dummy_inputs(), output="images")
|
||||
assert torch.isnan(output).sum() == 0, "Accelerator Inference returns NaN"
|
||||
|
||||
def test_num_images_per_prompt(self):
|
||||
pipe = self.get_pipeline()
|
||||
pipe = self.get_pipeline().to(torch_device)
|
||||
|
||||
if "num_images_per_prompt" not in pipe.blocks.input_names:
|
||||
return
|
||||
|
||||
pipe = pipe.to(torch_device)
|
||||
pipe.set_progress_bar_config(disable=None)
|
||||
pytest.mark.skip("Skipping test as `num_images_per_prompt` is not present in input names.")
|
||||
|
||||
batch_sizes = [1, 2]
|
||||
num_images_per_prompts = [1, 2]
|
||||
@@ -342,3 +312,25 @@ class ModularPipelineTesterMixin:
|
||||
image_slices.append(image[0, -3:, -3:, -1].flatten())
|
||||
|
||||
assert torch.abs(image_slices[0] - image_slices[1]).max() < 1e-3
|
||||
|
||||
|
||||
class ModularGuiderTesterMixin:
|
||||
def test_guider_cfg(self, expected_max_diff=1e-2):
|
||||
pipe = self.get_pipeline().to(torch_device)
|
||||
|
||||
# forward pass with CFG not applied
|
||||
guider = ClassifierFreeGuidance(guidance_scale=1.0)
|
||||
pipe.update_components(guider=guider)
|
||||
|
||||
inputs = self.get_dummy_inputs()
|
||||
out_no_cfg = pipe(**inputs, output="images")
|
||||
|
||||
# forward pass with CFG applied
|
||||
guider = ClassifierFreeGuidance(guidance_scale=7.5)
|
||||
pipe.update_components(guider=guider)
|
||||
inputs = self.get_dummy_inputs()
|
||||
out_cfg = pipe(**inputs, output="images")
|
||||
|
||||
assert out_cfg.shape == out_no_cfg.shape
|
||||
max_diff = torch.abs(out_cfg - out_no_cfg).max()
|
||||
assert max_diff > expected_max_diff, "Output with CFG must be different from normal inference"
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user