Compare commits
3 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| bffa3a9754 | |||
| 1c558712e8 | |||
| 1f026ad14e |
@@ -84,7 +84,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-k "not Flax and not Onnx" \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
--report-log=tests_pipeline_${{ matrix.module }}_cuda.log \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
@@ -138,7 +138,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-k "not Flax and not Onnx" \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_torch_${{ matrix.module }}_cuda \
|
||||
--report-log=tests_torch_${{ matrix.module }}_cuda.log \
|
||||
tests/${{ matrix.module }}
|
||||
@@ -151,7 +151,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
--make-reports=examples_torch_cuda \
|
||||
-s -v --make-reports=examples_torch_cuda \
|
||||
--report-log=examples_torch_cuda.log \
|
||||
examples/
|
||||
|
||||
@@ -198,7 +198,7 @@ jobs:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
RUN_COMPILE: yes
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "compile" --make-reports=tests_torch_compile_cuda tests/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_compile_cuda_failures_short.txt
|
||||
@@ -293,7 +293,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-k "not Flax and not Onnx" \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_torch_minimum_version_cuda \
|
||||
tests/models/test_modeling_common.py \
|
||||
tests/pipelines/test_pipelines_common.py \
|
||||
@@ -531,7 +531,7 @@ jobs:
|
||||
# HF_HOME: /System/Volumes/Data/mnt/cache
|
||||
# HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
# run: |
|
||||
# ${CONDA_RUN} pytest -n 1 --make-reports=tests_torch_mps \
|
||||
# ${CONDA_RUN} pytest -n 1 -s -v --make-reports=tests_torch_mps \
|
||||
# --report-log=tests_torch_mps.log \
|
||||
# tests/
|
||||
# - name: Failure short reports
|
||||
@@ -587,7 +587,7 @@ jobs:
|
||||
# HF_HOME: /System/Volumes/Data/mnt/cache
|
||||
# HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
# run: |
|
||||
# ${CONDA_RUN} pytest -n 1 --make-reports=tests_torch_mps \
|
||||
# ${CONDA_RUN} pytest -n 1 -s -v --make-reports=tests_torch_mps \
|
||||
# --report-log=tests_torch_mps.log \
|
||||
# tests/
|
||||
# - name: Failure short reports
|
||||
|
||||
@@ -120,7 +120,7 @@ jobs:
|
||||
if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
|
||||
run: |
|
||||
pytest -n 8 --max-worker-restart=0 --dist=loadfile \
|
||||
-k "not Flax and not Onnx" \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/modular_pipelines
|
||||
|
||||
|
||||
@@ -126,7 +126,7 @@ jobs:
|
||||
if: ${{ matrix.config.framework == 'pytorch_pipelines' }}
|
||||
run: |
|
||||
pytest -n 8 --max-worker-restart=0 --dist=loadfile \
|
||||
-k "not Flax and not Onnx" \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/pipelines
|
||||
|
||||
@@ -134,7 +134,7 @@ jobs:
|
||||
if: ${{ matrix.config.framework == 'pytorch_models' }}
|
||||
run: |
|
||||
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-k "not Flax and not Onnx and not Dependency" \
|
||||
-s -v -k "not Flax and not Onnx and not Dependency" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/models tests/schedulers tests/others
|
||||
|
||||
@@ -255,11 +255,11 @@ jobs:
|
||||
- name: Run fast PyTorch LoRA tests with PEFT
|
||||
run: |
|
||||
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
\
|
||||
-s -v \
|
||||
--make-reports=tests_peft_main \
|
||||
tests/lora/
|
||||
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
\
|
||||
-s -v \
|
||||
--make-reports=tests_models_lora_peft_main \
|
||||
tests/models/ -k "lora"
|
||||
|
||||
|
||||
@@ -151,13 +151,13 @@ jobs:
|
||||
run: |
|
||||
if [ "${{ matrix.module }}" = "ip_adapters" ]; then
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-k "not Flax and not Onnx" \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
else
|
||||
pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-k "not Flax and not Onnx and $pattern" \
|
||||
-s -v -k "not Flax and not Onnx and $pattern" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
fi
|
||||
@@ -222,10 +222,10 @@ jobs:
|
||||
run: |
|
||||
pattern=$(cat ${{ steps.extract_tests.outputs.pattern_file }})
|
||||
if [ -z "$pattern" ]; then
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
|
||||
pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx" tests/${{ matrix.module }} \
|
||||
--make-reports=tests_torch_cuda_${{ matrix.module }}
|
||||
else
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
|
||||
pytest -n 1 -sv --max-worker-restart=0 --dist=loadfile -k "not Flax and not Onnx and $pattern" tests/${{ matrix.module }} \
|
||||
--make-reports=tests_torch_cuda_${{ matrix.module }}
|
||||
fi
|
||||
|
||||
@@ -274,7 +274,7 @@ jobs:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
run: |
|
||||
uv pip install ".[training]"
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile --make-reports=examples_torch_cuda examples/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
|
||||
@@ -87,7 +87,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-k "not Flax and not Onnx" \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
- name: Failure short reports
|
||||
@@ -141,7 +141,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-k "not Flax and not Onnx" \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_torch_cuda_${{ matrix.module }} \
|
||||
tests/${{ matrix.module }}
|
||||
|
||||
@@ -189,7 +189,7 @@ jobs:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
RUN_COMPILE: yes
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "compile" --make-reports=tests_torch_compile_cuda tests/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_compile_cuda_failures_short.txt
|
||||
@@ -230,7 +230,7 @@ jobs:
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_xformers_cuda_failures_short.txt
|
||||
@@ -273,7 +273,7 @@ jobs:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
run: |
|
||||
uv pip install ".[training]"
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile --make-reports=examples_torch_cuda examples/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
|
||||
@@ -70,7 +70,7 @@ jobs:
|
||||
if: ${{ matrix.config.framework == 'pytorch' }}
|
||||
run: |
|
||||
pytest -n 4 --max-worker-restart=0 --dist=loadfile \
|
||||
-k "not Flax and not Onnx" \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_${{ matrix.config.report }} \
|
||||
tests/
|
||||
|
||||
|
||||
@@ -57,7 +57,7 @@ jobs:
|
||||
HF_HOME: /System/Volumes/Data/mnt/cache
|
||||
HF_TOKEN: ${{ secrets.HF_TOKEN }}
|
||||
run: |
|
||||
${CONDA_RUN} python -m pytest -n 0 --make-reports=tests_torch_mps tests/
|
||||
${CONDA_RUN} python -m pytest -n 0 -s -v --make-reports=tests_torch_mps tests/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
|
||||
@@ -84,7 +84,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-k "not Flax and not Onnx" \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_pipeline_${{ matrix.module }}_cuda \
|
||||
tests/pipelines/${{ matrix.module }}
|
||||
- name: Failure short reports
|
||||
@@ -137,7 +137,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-k "not Flax and not Onnx" \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_torch_${{ matrix.module }}_cuda \
|
||||
tests/${{ matrix.module }}
|
||||
|
||||
@@ -187,7 +187,7 @@ jobs:
|
||||
CUBLAS_WORKSPACE_CONFIG: :16:8
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile \
|
||||
-k "not Flax and not Onnx" \
|
||||
-s -v -k "not Flax and not Onnx" \
|
||||
--make-reports=tests_torch_minimum_cuda \
|
||||
tests/models/test_modeling_common.py \
|
||||
tests/pipelines/test_pipelines_common.py \
|
||||
@@ -240,7 +240,7 @@ jobs:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
RUN_COMPILE: yes
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "compile" --make-reports=tests_torch_compile_cuda tests/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "compile" --make-reports=tests_torch_compile_cuda tests/
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_compile_cuda_failures_short.txt
|
||||
@@ -281,7 +281,7 @@ jobs:
|
||||
env:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
run: |
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v -k "xformers" --make-reports=tests_torch_xformers_cuda tests/
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
run: cat reports/tests_torch_xformers_cuda_failures_short.txt
|
||||
@@ -326,7 +326,7 @@ jobs:
|
||||
HF_TOKEN: ${{ secrets.DIFFUSERS_HF_HUB_READ_TOKEN }}
|
||||
run: |
|
||||
uv pip install ".[training]"
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile --make-reports=examples_torch_cuda examples/
|
||||
pytest -n 1 --max-worker-restart=0 --dist=loadfile -s -v --make-reports=examples_torch_cuda examples/
|
||||
|
||||
- name: Failure short reports
|
||||
if: ${{ failure() }}
|
||||
|
||||
@@ -22,8 +22,6 @@
|
||||
title: Reproducibility
|
||||
- local: using-diffusers/schedulers
|
||||
title: Schedulers
|
||||
- local: using-diffusers/automodel
|
||||
title: AutoModel
|
||||
- local: using-diffusers/other-formats
|
||||
title: Model formats
|
||||
- local: using-diffusers/push_to_hub
|
||||
@@ -121,8 +119,6 @@
|
||||
title: ComponentsManager
|
||||
- local: modular_diffusers/guiders
|
||||
title: Guiders
|
||||
- local: modular_diffusers/custom_blocks
|
||||
title: Building Custom Blocks
|
||||
title: Modular Diffusers
|
||||
- isExpanded: false
|
||||
sections:
|
||||
@@ -391,8 +387,6 @@
|
||||
title: Transformer2DModel
|
||||
- local: api/models/transformer_temporal
|
||||
title: TransformerTemporalModel
|
||||
- local: api/models/wan_animate_transformer_3d
|
||||
title: WanAnimateTransformer3DModel
|
||||
- local: api/models/wan_transformer_3d
|
||||
title: WanTransformer3DModel
|
||||
title: Transformers
|
||||
@@ -454,8 +448,6 @@
|
||||
- sections:
|
||||
- local: api/pipelines/overview
|
||||
title: Overview
|
||||
- local: api/pipelines/auto_pipeline
|
||||
title: AutoPipeline
|
||||
- sections:
|
||||
- local: api/pipelines/audioldm
|
||||
title: AudioLDM
|
||||
@@ -468,6 +460,8 @@
|
||||
- local: api/pipelines/stable_audio
|
||||
title: Stable Audio
|
||||
title: Audio
|
||||
- local: api/pipelines/auto_pipeline
|
||||
title: AutoPipeline
|
||||
- sections:
|
||||
- local: api/pipelines/amused
|
||||
title: aMUSEd
|
||||
@@ -531,8 +525,6 @@
|
||||
title: HiDream-I1
|
||||
- local: api/pipelines/hunyuandit
|
||||
title: Hunyuan-DiT
|
||||
- local: api/pipelines/hunyuanimage21
|
||||
title: HunyuanImage2.1
|
||||
- local: api/pipelines/pix2pix
|
||||
title: InstructPix2Pix
|
||||
- local: api/pipelines/kandinsky
|
||||
@@ -646,6 +638,8 @@
|
||||
title: ConsisID
|
||||
- local: api/pipelines/framepack
|
||||
title: Framepack
|
||||
- local: api/pipelines/hunyuanimage21
|
||||
title: HunyuanImage2.1
|
||||
- local: api/pipelines/hunyuan_video
|
||||
title: HunyuanVideo
|
||||
- local: api/pipelines/i2vgenxl
|
||||
|
||||
@@ -12,7 +12,15 @@ specific language governing permissions and limitations under the License.
|
||||
|
||||
# AutoModel
|
||||
|
||||
[`AutoModel`] automatically retrieves the correct model class from the checkpoint `config.json` file.
|
||||
`AutoModel` is designed to make it easy to load a checkpoint without needing to know the specific model class. It automatically retrieves the correct model class from the checkpoint's `config.json` file.
|
||||
|
||||
```python
|
||||
from diffusers import AutoModel, AutoPipelineForText2Image
|
||||
|
||||
unet = AutoModel.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", subfolder="unet")
|
||||
pipe = AutoPipelineForText2Image.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", unet=unet)
|
||||
```
|
||||
|
||||
|
||||
## AutoModel
|
||||
|
||||
|
||||
@@ -1,30 +0,0 @@
|
||||
<!-- Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License. -->
|
||||
|
||||
# WanAnimateTransformer3DModel
|
||||
|
||||
`WanAnimateTransformer3DModel` is a Diffusion Transformer model for 3D video-like data, introduced in [Wan Animate](https://github.com/Wan-Video/Wan2.2) by the Alibaba Wan Team.
|
||||
|
||||
The model can be loaded with the following code snippet.
|
||||
|
||||
```python
|
||||
import torch
from diffusers import WanAnimateTransformer3DModel
|
||||
|
||||
transformer = WanAnimateTransformer3DModel.from_pretrained("Wan-AI/Wan2.2-Animate-14B-Diffusers", subfolder="transformer", torch_dtype=torch.bfloat16)
|
||||
```
|
||||
|
||||
## WanAnimateTransformer3DModel
|
||||
|
||||
[[autodoc]] WanAnimateTransformer3DModel
|
||||
|
||||
## Transformer2DModelOutput
|
||||
|
||||
[[autodoc]] models.modeling_outputs.Transformer2DModelOutput
|
||||
@@ -40,7 +40,6 @@ The following Wan models are supported in Diffusers:
|
||||
- [Wan 2.2 T2V 14B](https://huggingface.co/Wan-AI/Wan2.2-T2V-A14B-Diffusers)
|
||||
- [Wan 2.2 I2V 14B](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers)
|
||||
- [Wan 2.2 TI2V 5B](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B-Diffusers)
|
||||
- [Wan 2.2 Animate 14B](https://huggingface.co/Wan-AI/Wan2.2-Animate-14B-Diffusers)
|
||||
|
||||
> [!TIP]
|
||||
> Click on the Wan models in the right sidebar for more examples of video generation.
|
||||
@@ -96,15 +95,15 @@ pipeline = WanPipeline.from_pretrained(
|
||||
pipeline.to("cuda")
|
||||
|
||||
prompt = """
|
||||
The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
|
||||
"""
|
||||
negative_prompt = """
|
||||
Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
|
||||
low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
|
||||
Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
|
||||
low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
|
||||
misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards
|
||||
"""
|
||||
|
||||
@@ -151,15 +150,15 @@ pipeline.transformer = torch.compile(
|
||||
)
|
||||
|
||||
prompt = """
|
||||
The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
|
||||
"""
|
||||
negative_prompt = """
|
||||
Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
|
||||
low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
|
||||
Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality,
|
||||
low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured,
|
||||
misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards
|
||||
"""
|
||||
|
||||
@@ -250,208 +249,6 @@ The code snippets available in [this](https://github.com/huggingface/diffusers/p
|
||||
|
||||
The general rule of thumb to keep in mind when preparing inputs for the VACE pipeline is that the input images, or frames of a video that you want to use for conditioning, should have a corresponding mask that is black in color. The black mask signifies that the model will not generate new content for that area, and only use those parts for conditioning the generation process. For parts/frames that should be generated by the model, the mask should be white in color.
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Wan-Animate: Unified Character Animation and Replacement with Holistic Replication
|
||||
|
||||
[Wan-Animate](https://huggingface.co/papers/2509.14055) by the Wan Team.
|
||||
|
||||
*We introduce Wan-Animate, a unified framework for character animation and replacement. Given a character image and a reference video, Wan-Animate can animate the character by precisely replicating the expressions and movements of the character in the video to generate high-fidelity character videos. Alternatively, it can integrate the animated character into the reference video to replace the original character, replicating the scene's lighting and color tone to achieve seamless environmental integration. Wan-Animate is built upon the Wan model. To adapt it for character animation tasks, we employ a modified input paradigm to differentiate between reference conditions and regions for generation. This design unifies multiple tasks into a common symbolic representation. We use spatially-aligned skeleton signals to replicate body motion and implicit facial features extracted from source images to reenact expressions, enabling the generation of character videos with high controllability and expressiveness. Furthermore, to enhance environmental integration during character replacement, we develop an auxiliary Relighting LoRA. This module preserves the character's appearance consistency while applying the appropriate environmental lighting and color tone. Experimental results demonstrate that Wan-Animate achieves state-of-the-art performance. We are committed to open-sourcing the model weights and its source code.*
|
||||
|
||||
The project page: https://humanaigc.github.io/wan-animate
|
||||
|
||||
This model was mostly contributed by [M. Tolga Cangöz](https://github.com/tolgacangoz).
|
||||
|
||||
#### Usage
|
||||
|
||||
The Wan-Animate pipeline supports two modes of operation:
|
||||
|
||||
1. **Animation Mode** (default): Animates a character image based on motion and expression from reference videos
|
||||
2. **Replacement Mode**: Replaces a character in a background video with a new character while preserving the scene
|
||||
|
||||
##### Prerequisites
|
||||
|
||||
Before using the pipeline, you need to preprocess your reference video to extract:
|
||||
- **Pose video**: Contains skeletal keypoints representing body motion
|
||||
- **Face video**: Contains facial feature representations for expression control
|
||||
|
||||
For replacement mode, you additionally need:
|
||||
- **Background video**: The original video containing the scene
|
||||
- **Mask video**: A mask indicating where to generate content (white) vs. preserve original (black)
|
||||
|
||||
> [!NOTE]
|
||||
> Raw videos should not be used for inputs such as `pose_video`, which the pipeline expects to be preprocessed to extract the proper information. Preprocessing scripts to prepare these inputs are available in the [original Wan-Animate repository](https://github.com/Wan-Video/Wan2.2?tab=readme-ov-file#1-preprocessing). Integration of these preprocessing steps into Diffusers is planned for a future release.
|
||||
|
||||
The example below demonstrates how to use the Wan-Animate pipeline:
|
||||
|
||||
<hfoptions id="Animate usage">
|
||||
<hfoption id="Animation mode">
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
import torch
|
||||
from diffusers import AutoencoderKLWan, WanAnimatePipeline
|
||||
from diffusers.utils import export_to_video, load_image, load_video
|
||||
|
||||
model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
|
||||
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
||||
pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
|
||||
pipe.to("cuda")
|
||||
|
||||
# Load character image and preprocessed videos
|
||||
image = load_image("path/to/character.jpg")
|
||||
pose_video = load_video("path/to/pose_video.mp4") # Preprocessed skeletal keypoints
|
||||
face_video = load_video("path/to/face_video.mp4") # Preprocessed facial features
|
||||
|
||||
# Resize image to match VAE constraints
|
||||
def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
|
||||
aspect_ratio = image.height / image.width
|
||||
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
|
||||
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
|
||||
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
|
||||
image = image.resize((width, height))
|
||||
return image, height, width
|
||||
|
||||
image, height, width = aspect_ratio_resize(image, pipe)
|
||||
|
||||
prompt = "A person dancing energetically in a studio with dynamic lighting and professional camera work"
|
||||
negative_prompt = "blurry, low quality, distorted, deformed, static, poorly drawn"
|
||||
|
||||
# Generate animated video
|
||||
output = pipe(
|
||||
image=image,
|
||||
pose_video=pose_video,
|
||||
face_video=face_video,
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
height=height,
|
||||
width=width,
|
||||
segment_frame_length=77,
|
||||
guidance_scale=1.0,
|
||||
mode="animate", # Animation mode (default)
|
||||
).frames[0]
|
||||
export_to_video(output, "animated_character.mp4", fps=30)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Replacement mode">
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
import torch
|
||||
from diffusers import AutoencoderKLWan, WanAnimatePipeline
|
||||
from diffusers.utils import export_to_video, load_image, load_video
|
||||
|
||||
model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
|
||||
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
||||
pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
|
||||
pipe.to("cuda")
|
||||
|
||||
# Load all required inputs for replacement mode
|
||||
image = load_image("path/to/new_character.jpg")
|
||||
pose_video = load_video("path/to/pose_video.mp4") # Preprocessed skeletal keypoints
|
||||
face_video = load_video("path/to/face_video.mp4") # Preprocessed facial features
|
||||
background_video = load_video("path/to/background_video.mp4") # Original scene
|
||||
mask_video = load_video("path/to/mask_video.mp4") # Black: preserve, White: generate
|
||||
|
||||
# Resize image to match VAE constraints
|
||||
def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
|
||||
aspect_ratio = image.height / image.width
|
||||
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
|
||||
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
|
||||
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
|
||||
image = image.resize((width, height))
|
||||
return image, height, width
|
||||
|
||||
image, height, width = aspect_ratio_resize(image, pipe)
|
||||
|
||||
prompt = "A person seamlessly integrated into the scene with consistent lighting and environment"
|
||||
negative_prompt = "blurry, low quality, inconsistent lighting, floating, disconnected from scene"
|
||||
|
||||
# Replace character in background video
|
||||
output = pipe(
|
||||
image=image,
|
||||
pose_video=pose_video,
|
||||
face_video=face_video,
|
||||
background_video=background_video,
|
||||
mask_video=mask_video,
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
height=height,
|
||||
width=width,
|
||||
segment_frame_length=77,
|
||||
guidance_scale=1.0,
|
||||
mode="replace", # Replacement mode
|
||||
).frames[0]
|
||||
export_to_video(output, "character_replaced.mp4", fps=30)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Advanced options">
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
import torch
|
||||
from diffusers import AutoencoderKLWan, WanAnimatePipeline
|
||||
from diffusers.utils import export_to_video, load_image, load_video
|
||||
|
||||
model_id = "Wan-AI/Wan2.2-Animate-14B-Diffusers"
|
||||
vae = AutoencoderKLWan.from_pretrained(model_id, subfolder="vae", torch_dtype=torch.float32)
|
||||
pipe = WanAnimatePipeline.from_pretrained(model_id, vae=vae, torch_dtype=torch.bfloat16)
|
||||
pipe.to("cuda")
|
||||
|
||||
image = load_image("path/to/character.jpg")
|
||||
pose_video = load_video("path/to/pose_video.mp4")
|
||||
face_video = load_video("path/to/face_video.mp4")
|
||||
|
||||
def aspect_ratio_resize(image, pipe, max_area=720 * 1280):
|
||||
aspect_ratio = image.height / image.width
|
||||
mod_value = pipe.vae_scale_factor_spatial * pipe.transformer.config.patch_size[1]
|
||||
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value * mod_value
|
||||
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value * mod_value
|
||||
image = image.resize((width, height))
|
||||
return image, height, width
|
||||
|
||||
image, height, width = aspect_ratio_resize(image, pipe)
|
||||
|
||||
prompt = "A person dancing energetically in a studio"
|
||||
negative_prompt = "blurry, low quality"
|
||||
|
||||
# Advanced: Use temporal guidance and custom callback
|
||||
def callback_fn(pipe, step_index, timestep, callback_kwargs):
|
||||
# You can modify latents or other tensors here
|
||||
print(f"Step {step_index}, Timestep {timestep}")
|
||||
return callback_kwargs
|
||||
|
||||
output = pipe(
|
||||
image=image,
|
||||
pose_video=pose_video,
|
||||
face_video=face_video,
|
||||
prompt=prompt,
|
||||
negative_prompt=negative_prompt,
|
||||
height=height,
|
||||
width=width,
|
||||
segment_frame_length=77,
|
||||
num_inference_steps=50,
|
||||
guidance_scale=5.0,
|
||||
prev_segment_conditioning_frames=5, # Use 5 frames for temporal guidance (1 or 5 recommended)
|
||||
callback_on_step_end=callback_fn,
|
||||
callback_on_step_end_tensor_inputs=["latents"],
|
||||
).frames[0]
|
||||
export_to_video(output, "animated_advanced.mp4", fps=30)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
#### Key Parameters
|
||||
|
||||
- **mode**: Choose between `"animate"` (default) and `"replace"`.
|
||||
- **prev_segment_conditioning_frames**: Number of frames for temporal guidance (1 or 5 recommended). Using 5 provides better temporal consistency but requires more memory.
|
||||
- **guidance_scale**: Controls how closely the output follows the text prompt. Higher values (5-7) produce results more aligned with the prompt. For Wan-Animate, CFG is disabled by default (`guidance_scale=1.0`) but can be enabled to support negative prompts and finer control over facial expressions. (Note that CFG will only target the text prompt and face conditioning.)
|
||||
|
||||
|
||||
## Notes
|
||||
|
||||
- Wan2.1 supports LoRAs with [`~loaders.WanLoraLoaderMixin.load_lora_weights`].
|
||||
@@ -484,10 +281,10 @@ export_to_video(output, "animated_advanced.mp4", fps=30)
|
||||
|
||||
# use "steamboat willie style" to trigger the LoRA
|
||||
prompt = """
|
||||
steamboat willie style, golden era animation, The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
steamboat willie style, golden era animation, The camera rushes from far to near in a low-angle shot,
|
||||
revealing a white ferret on a log. It plays, leaps into the water, and emerges, as the camera zooms in
|
||||
for a close-up. Water splashes berry bushes nearby, while moss, snow, and leaves blanket the ground.
|
||||
Birch trees and a light blue sky frame the scene, with ferns in the foreground. Side lighting casts dynamic
|
||||
shadows and warm highlights. Medium composition, front view, low angle, with depth of field.
|
||||
"""
|
||||
|
||||
@@ -562,12 +359,6 @@ export_to_video(output, "animated_advanced.mp4", fps=30)
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## WanAnimatePipeline
|
||||
|
||||
[[autodoc]] WanAnimatePipeline
|
||||
- all
|
||||
- __call__
|
||||
|
||||
## WanPipelineOutput
|
||||
|
||||
[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput
|
||||
[[autodoc]] pipelines.wan.pipeline_output.WanPipelineOutput
|
||||
@@ -1,492 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
|
||||
# Building Custom Blocks
|
||||
|
||||
[ModularPipelineBlocks](./pipeline_block) are the fundamental building blocks of a [`ModularPipeline`]. You can create custom blocks by defining their inputs, outputs, and computation logic. This guide demonstrates how to create and use a custom block.
|
||||
|
||||
> [!TIP]
|
||||
> Explore the [Modular Diffusers Custom Blocks](https://huggingface.co/collections/diffusers/modular-diffusers-custom-blocks) collection for official custom modular blocks like Nano Banana.
|
||||
|
||||
## Project Structure
|
||||
|
||||
Your custom block project should use the following structure:
|
||||
|
||||
```shell
|
||||
.
|
||||
├── block.py
|
||||
└── modular_config.json
|
||||
```
|
||||
|
||||
- `block.py` contains the custom block implementation
|
||||
- `modular_config.json` contains the metadata needed to load the block
|
||||
|
||||
## Example: Florence 2 Inpainting Block
|
||||
|
||||
In this example we will create a custom block that uses the [Florence 2](https://huggingface.co/docs/transformers/model_doc/florence2) model to process an input image and generate a mask for inpainting.
|
||||
|
||||
The first step is to define the components that the block will use. In this case, we will need to use the `Florence2ForConditionalGeneration` model and its corresponding processor `AutoProcessor`. When defining components, we must specify the name of the component within our pipeline, the model class via `type_hint`, and a `pretrained_model_name_or_path` for the component if we intend to load the model weights from a specific repository on the Hub.
|
||||
|
||||
```py
|
||||
# Inside block.py
|
||||
from diffusers.modular_pipelines import (
|
||||
ModularPipelineBlocks,
|
||||
ComponentSpec,
|
||||
)
|
||||
from transformers import AutoProcessor, Florence2ForConditionalGeneration
|
||||
|
||||
|
||||
class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
|
||||
|
||||
@property
|
||||
def expected_components(self):
|
||||
return [
|
||||
ComponentSpec(
|
||||
name="image_annotator",
|
||||
type_hint=Florence2ForConditionalGeneration,
|
||||
pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
|
||||
),
|
||||
ComponentSpec(
|
||||
name="image_annotator_processor",
|
||||
type_hint=AutoProcessor,
|
||||
pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
|
||||
),
|
||||
]
|
||||
```
|
||||
|
||||
Next, we define the inputs and outputs of the block. The inputs include the image to be annotated, the annotation task, and the annotation prompt. The outputs include the generated mask image and annotations.
|
||||
|
||||
```py
|
||||
from typing import List, Union
|
||||
from PIL import Image, ImageDraw
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from diffusers.modular_pipelines import (
|
||||
PipelineState,
|
||||
ModularPipelineBlocks,
|
||||
InputParam,
|
||||
ComponentSpec,
|
||||
OutputParam,
|
||||
)
|
||||
from transformers import AutoProcessor, Florence2ForConditionalGeneration
|
||||
|
||||
|
||||
class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
|
||||
|
||||
@property
|
||||
def expected_components(self):
|
||||
return [
|
||||
ComponentSpec(
|
||||
name="image_annotator",
|
||||
type_hint=Florence2ForConditionalGeneration,
|
||||
pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
|
||||
),
|
||||
ComponentSpec(
|
||||
name="image_annotator_processor",
|
||||
type_hint=AutoProcessor,
|
||||
pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
|
||||
),
|
||||
]
|
||||
|
||||
@property
|
||||
def inputs(self) -> List[InputParam]:
|
||||
return [
|
||||
InputParam(
|
||||
"image",
|
||||
type_hint=Union[Image.Image, List[Image.Image]],
|
||||
required=True,
|
||||
description="Image(s) to annotate",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_task",
|
||||
type_hint=Union[str, List[str]],
|
||||
required=True,
|
||||
default="<REFERRING_EXPRESSION_SEGMENTATION>",
|
||||
description="""Annotation Task to perform on the image.
|
||||
Supported Tasks:
|
||||
|
||||
<OD>
|
||||
<REFERRING_EXPRESSION_SEGMENTATION>
|
||||
<CAPTION>
|
||||
<DETAILED_CAPTION>
|
||||
<MORE_DETAILED_CAPTION>
|
||||
<DENSE_REGION_CAPTION>
|
||||
<CAPTION_TO_PHRASE_GROUNDING>
|
||||
<OPEN_VOCABULARY_DETECTION>
|
||||
|
||||
""",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_prompt",
|
||||
type_hint=Union[str, List[str]],
|
||||
required=True,
|
||||
description="""Annotation Prompt to provide more context to the task.
|
||||
Can be used to detect or segment out specific elements in the image
|
||||
""",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_output_type",
|
||||
type_hint=str,
|
||||
required=True,
|
||||
default="mask_image",
|
||||
description="""Output type from annotation predictions. Available options are
|
||||
mask_image:
|
||||
-black and white mask image for the given image based on the task type
|
||||
mask_overlay:
|
||||
- mask overlaid on the original image
|
||||
bounding_box:
|
||||
- bounding boxes drawn on the original image
|
||||
""",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_overlay",
|
||||
type_hint=bool,
|
||||
required=True,
|
||||
default=False,
|
||||
description="",
|
||||
),
|
||||
]
|
||||
|
||||
@property
|
||||
def intermediate_outputs(self) -> List[OutputParam]:
|
||||
return [
|
||||
OutputParam(
|
||||
"mask_image",
|
||||
type_hint=Image,
|
||||
description="Inpainting Mask for input Image(s)",
|
||||
),
|
||||
OutputParam(
|
||||
"annotations",
|
||||
type_hint=dict,
|
||||
description="Annotations Predictions for input Image(s)",
|
||||
),
|
||||
OutputParam(
|
||||
"image",
|
||||
type_hint=Image,
|
||||
description="Annotated input Image(s)",
|
||||
),
|
||||
]
|
||||
|
||||
```
|
||||
|
||||
Now we implement the `__call__` method, which contains the logic for processing the input image and generating the mask.
|
||||
|
||||
```py
|
||||
from typing import List, Union
|
||||
from PIL import Image, ImageDraw
|
||||
import torch
|
||||
import numpy as np
|
||||
|
||||
from diffusers.modular_pipelines import (
|
||||
PipelineState,
|
||||
ModularPipelineBlocks,
|
||||
InputParam,
|
||||
ComponentSpec,
|
||||
OutputParam,
|
||||
)
|
||||
from transformers import AutoProcessor, Florence2ForConditionalGeneration
|
||||
|
||||
|
||||
class Florence2ImageAnnotatorBlock(ModularPipelineBlocks):
|
||||
|
||||
@property
|
||||
def expected_components(self):
|
||||
return [
|
||||
ComponentSpec(
|
||||
name="image_annotator",
|
||||
type_hint=Florence2ForConditionalGeneration,
|
||||
pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
|
||||
),
|
||||
ComponentSpec(
|
||||
name="image_annotator_processor",
|
||||
type_hint=AutoProcessor,
|
||||
pretrained_model_name_or_path="florence-community/Florence-2-base-ft",
|
||||
),
|
||||
]
|
||||
|
||||
@property
|
||||
def inputs(self) -> List[InputParam]:
|
||||
return [
|
||||
InputParam(
|
||||
"image",
|
||||
type_hint=Union[Image.Image, List[Image.Image]],
|
||||
required=True,
|
||||
description="Image(s) to annotate",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_task",
|
||||
type_hint=Union[str, List[str]],
|
||||
required=True,
|
||||
default="<REFERRING_EXPRESSION_SEGMENTATION>",
|
||||
description="""Annotation Task to perform on the image.
|
||||
Supported Tasks:
|
||||
|
||||
<OD>
|
||||
<REFERRING_EXPRESSION_SEGMENTATION>
|
||||
<CAPTION>
|
||||
<DETAILED_CAPTION>
|
||||
<MORE_DETAILED_CAPTION>
|
||||
<DENSE_REGION_CAPTION>
|
||||
<CAPTION_TO_PHRASE_GROUNDING>
|
||||
<OPEN_VOCABULARY_DETECTION>
|
||||
|
||||
""",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_prompt",
|
||||
type_hint=Union[str, List[str]],
|
||||
required=True,
|
||||
description="""Annotation Prompt to provide more context to the task.
|
||||
Can be used to detect or segment out specific elements in the image
|
||||
""",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_output_type",
|
||||
type_hint=str,
|
||||
required=True,
|
||||
default="mask_image",
|
||||
description="""Output type from annotation predictions. Available options are
|
||||
mask_image:
|
||||
-black and white mask image for the given image based on the task type
|
||||
mask_overlay:
|
||||
- mask overlaid on the original image
|
||||
bounding_box:
|
||||
- bounding boxes drawn on the original image
|
||||
""",
|
||||
),
|
||||
InputParam(
|
||||
"annotation_overlay",
|
||||
type_hint=bool,
|
||||
required=True,
|
||||
default=False,
|
||||
description="",
|
||||
),
|
||||
]
|
||||
|
||||
@property
|
||||
def intermediate_outputs(self) -> List[OutputParam]:
|
||||
return [
|
||||
OutputParam(
|
||||
"mask_image",
|
||||
type_hint=Image,
|
||||
description="Inpainting Mask for input Image(s)",
|
||||
),
|
||||
OutputParam(
|
||||
"annotations",
|
||||
type_hint=dict,
|
||||
description="Annotations Predictions for input Image(s)",
|
||||
),
|
||||
OutputParam(
|
||||
"image",
|
||||
type_hint=Image,
|
||||
description="Annotated input Image(s)",
|
||||
),
|
||||
]
|
||||
|
||||
def get_annotations(self, components, images, prompts, task):
|
||||
task_prompts = [task + prompt for prompt in prompts]
|
||||
|
||||
inputs = components.image_annotator_processor(
|
||||
text=task_prompts, images=images, return_tensors="pt"
|
||||
).to(components.image_annotator.device, components.image_annotator.dtype)
|
||||
|
||||
generated_ids = components.image_annotator.generate(
|
||||
input_ids=inputs["input_ids"],
|
||||
pixel_values=inputs["pixel_values"],
|
||||
max_new_tokens=1024,
|
||||
early_stopping=False,
|
||||
do_sample=False,
|
||||
num_beams=3,
|
||||
)
|
||||
annotations = components.image_annotator_processor.batch_decode(
|
||||
generated_ids, skip_special_tokens=False
|
||||
)
|
||||
outputs = []
|
||||
for image, annotation in zip(images, annotations):
|
||||
outputs.append(
|
||||
components.image_annotator_processor.post_process_generation(
|
||||
annotation, task=task, image_size=(image.width, image.height)
|
||||
)
|
||||
)
|
||||
return outputs
|
||||
|
||||
def prepare_mask(self, images, annotations, overlay=False, fill="white"):
|
||||
masks = []
|
||||
for image, annotation in zip(images, annotations):
|
||||
mask_image = image.copy() if overlay else Image.new("L", image.size, 0)
|
||||
draw = ImageDraw.Draw(mask_image)
|
||||
|
||||
for _, _annotation in annotation.items():
|
||||
if "polygons" in _annotation:
|
||||
for polygon in _annotation["polygons"]:
|
||||
polygon = np.array(polygon).reshape(-1, 2)
|
||||
if len(polygon) < 3:
|
||||
continue
|
||||
polygon = polygon.reshape(-1).tolist()
|
||||
draw.polygon(polygon, fill=fill)
|
||||
|
||||
elif "bbox" in _annotation:
|
||||
bbox = _annotation["bbox"]
|
||||
draw.rectangle(bbox, fill="white")
|
||||
|
||||
masks.append(mask_image)
|
||||
|
||||
return masks
|
||||
|
||||
def prepare_bounding_boxes(self, images, annotations):
|
||||
outputs = []
|
||||
for image, annotation in zip(images, annotations):
|
||||
image_copy = image.copy()
|
||||
draw = ImageDraw.Draw(image_copy)
|
||||
for _, _annotation in annotation.items():
|
||||
bbox = _annotation["bbox"]
|
||||
label = _annotation["label"]
|
||||
|
||||
draw.rectangle(bbox, outline="red", width=3)
|
||||
draw.text((bbox[0], bbox[1] - 20), label, fill="red")
|
||||
|
||||
outputs.append(image_copy)
|
||||
|
||||
return outputs
|
||||
|
||||
def prepare_inputs(self, images, prompts):
|
||||
prompts = prompts or ""
|
||||
|
||||
if isinstance(images, Image.Image):
|
||||
images = [images]
|
||||
if isinstance(prompts, str):
|
||||
prompts = [prompts]
|
||||
|
||||
if len(images) != len(prompts):
|
||||
raise ValueError("Number of images and annotation prompts must match.")
|
||||
|
||||
return images, prompts
|
||||
|
||||
@torch.no_grad()
|
||||
def __call__(self, components, state: PipelineState) -> PipelineState:
|
||||
block_state = self.get_block_state(state)
|
||||
images, annotation_task_prompt = self.prepare_inputs(
|
||||
block_state.image, block_state.annotation_prompt
|
||||
)
|
||||
task = block_state.annotation_task
|
||||
fill = block_state.fill
|
||||
|
||||
annotations = self.get_annotations(
|
||||
components, images, annotation_task_prompt, task
|
||||
)
|
||||
block_state.annotations = annotations
|
||||
if block_state.annotation_output_type == "mask_image":
|
||||
block_state.mask_image = self.prepare_mask(images, annotations)
|
||||
else:
|
||||
block_state.mask_image = None
|
||||
|
||||
if block_state.annotation_output_type == "mask_overlay":
|
||||
block_state.image = self.prepare_mask(images, annotations, overlay=True, fill=fill)
|
||||
|
||||
elif block_state.annotation_output_type == "bounding_box":
|
||||
block_state.image = self.prepare_bounding_boxes(images, annotations)
|
||||
|
||||
self.set_block_state(state, block_state)
|
||||
|
||||
return components, state
|
||||
|
||||
```
|
||||
|
||||
Once we have defined our custom block, we can save it to the Hub, using either the CLI or the [`push_to_hub`] method. This will make it easy to share and reuse our custom block with other pipelines.
|
||||
|
||||
<hfoptions id="share">
|
||||
<hfoption id="hf CLI">
|
||||
|
||||
```shell
|
||||
# In the folder with the `block.py` file, run:
|
||||
diffusers-cli custom_block
|
||||
```
|
||||
|
||||
Then upload the block to the Hub:
|
||||
|
||||
```shell
|
||||
hf upload <your repo id> . .
|
||||
```
|
||||
</hfoption>
|
||||
<hfoption id="push_to_hub">
|
||||
|
||||
```py
|
||||
from block import Florence2ImageAnnotatorBlock
|
||||
block = Florence2ImageAnnotatorBlock()
|
||||
block.push_to_hub("<your repo id>")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Using Custom Blocks
|
||||
|
||||
Load the custom block with [`~ModularPipelineBlocks.from_pretrained`] and set `trust_remote_code=True`.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers.modular_pipelines import ModularPipelineBlocks, SequentialPipelineBlocks
|
||||
from diffusers.modular_pipelines.stable_diffusion_xl import INPAINT_BLOCKS
|
||||
from diffusers.utils import load_image
|
||||
|
||||
# Fetch the Florence2 image annotator block that will create our mask
|
||||
image_annotator_block = ModularPipelineBlocks.from_pretrained("diffusers/florence-2-custom-block", trust_remote_code=True)
|
||||
|
||||
my_blocks = INPAINT_BLOCKS.copy()
|
||||
# insert the annotation block before the image encoding step
|
||||
my_blocks.insert("image_annotator", image_annotator_block, 1)
|
||||
|
||||
# Create our initial set of inpainting blocks
|
||||
blocks = SequentialPipelineBlocks.from_blocks_dict(my_blocks)
|
||||
|
||||
repo_id = "diffusers/modular-stable-diffusion-xl-base-1.0"
|
||||
pipe = blocks.init_pipeline(repo_id)
|
||||
pipe.load_components(torch_dtype=torch.float16, device_map="cuda", trust_remote_code=True)
|
||||
|
||||
image = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg?download=true")
|
||||
image = image.resize((1024, 1024))
|
||||
|
||||
prompt = ["A red car"]
|
||||
annotation_task = "<REFERRING_EXPRESSION_SEGMENTATION>"
|
||||
annotation_prompt = ["the car"]
|
||||
|
||||
output = pipe(
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
annotation_task=annotation_task,
|
||||
annotation_prompt=annotation_prompt,
|
||||
annotation_output_type="mask_image",
|
||||
num_inference_steps=35,
|
||||
guidance_scale=7.5,
|
||||
strength=0.95,
|
||||
output="images"
|
||||
)
|
||||
output[0].save("florence-inpainting.png")
|
||||
```
|
||||
|
||||
## Editing Custom Blocks
|
||||
|
||||
By default, custom blocks are saved in your cache directory. Use the `local_dir` argument to download and edit a custom block in a specific folder.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers.modular_pipelines import ModularPipelineBlocks, SequentialPipelineBlocks
|
||||
from diffusers.modular_pipelines.stable_diffusion_xl import INPAINT_BLOCKS
|
||||
from diffusers.utils import load_image
|
||||
|
||||
# Fetch the Florence2 image annotator block that will create our mask
|
||||
image_annotator_block = ModularPipelineBlocks.from_pretrained("diffusers/florence-2-custom-block", trust_remote_code=True, local_dir="/my-local-folder")
|
||||
```
|
||||
|
||||
Any changes made to the block files in this folder will be reflected when you load the block again.
|
||||
@@ -1,46 +0,0 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# AutoModel
|
||||
|
||||
The [`AutoModel`] class automatically detects and loads the correct model class (UNet, transformer, VAE) from a `config.json` file. You don't need to know the specific model class name ahead of time. It supports data types and device placement, and works across model types and libraries.
|
||||
|
||||
The example below loads a transformer from Diffusers and a text encoder from Transformers. Use the `subfolder` parameter to specify where to load the `config.json` file from.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoModel, DiffusionPipeline
|
||||
|
||||
transformer = AutoModel.from_pretrained(
|
||||
"Qwen/Qwen-Image", subfolder="transformer", torch_dtype=torch.bfloat16, device_map="cuda"
|
||||
)
|
||||
|
||||
text_encoder = AutoModel.from_pretrained(
|
||||
"Qwen/Qwen-Image", subfolder="text_encoder", torch_dtype=torch.bfloat16, device_map="cuda"
|
||||
)
|
||||
```
|
||||
|
||||
[`AutoModel`] also loads models from the [Hub](https://huggingface.co/models) that aren't included in Diffusers. Set `trust_remote_code=True` in [`AutoModel.from_pretrained`] to load custom models.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from diffusers import AutoModel
|
||||
|
||||
transformer = AutoModel.from_pretrained(
|
||||
"custom/custom-transformer-model", trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="cuda"
|
||||
)
|
||||
```
|
||||
|
||||
If the custom model inherits from the [`ModelMixin`] class, it gets access to the same features as Diffusers model classes, like [regional compilation](../optimization/fp16#regional-compilation) and [group offloading](../optimization/memory#group-offloading).
|
||||
|
||||
> [!NOTE]
|
||||
> Learn more about implementing custom models in the [Community components](../using-diffusers/custom_pipeline_overview#community-components) guide.
|
||||
@@ -5488,7 +5488,7 @@ Editing at Scale", many thanks to their contribution!
|
||||
|
||||
This implementation of Flux Kontext allows users to pass multiple reference images. Each image is encoded separately, and the resulting latent vectors are concatenated.
|
||||
|
||||
As explained in Section 3 of [the paper](https://huggingface.co/papers/2506.15742), the model's sequence concatenation mechanism can extend its capabilities to handle multiple reference images. However, note that the current version of Flux Kontext was not trained for this use case. In practice, stacking along the first axis does not yield correct results, while stacking along the other two axes appears to work.
|
||||
As explained in Section 3 of [the paper](https://arxiv.org/pdf/2506.15742), the model's sequence concatenation mechanism can extend its capabilities to handle multiple reference images. However, note that the current version of Flux Kontext was not trained for this use case. In practice, stacking along the first axis does not yield correct results, while stacking along the other two axes appears to work.
|
||||
|
||||
## Example Usage
|
||||
|
||||
|
||||
@@ -490,7 +490,7 @@ class RegionalPromptingStableDiffusionPipeline(
|
||||
def prepare_extra_step_kwargs(self, generator, eta):
|
||||
# prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
|
||||
# eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
|
||||
# eta corresponds to η in DDIM paper: https://huggingface.co/papers/2010.02502
|
||||
# eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
|
||||
# and should be between [0, 1]
|
||||
|
||||
accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
|
||||
@@ -841,7 +841,7 @@ class RegionalPromptingStableDiffusionPipeline(
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
Corresponds to parameter eta (η) from the [DDIM](https://huggingface.co/papers/2010.02502) paper. Only applies
|
||||
Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
|
||||
to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
|
||||
@@ -872,7 +872,7 @@ class RegionalPromptingStableDiffusionPipeline(
|
||||
[`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
|
||||
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
||||
Guidance rescale factor from [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
|
||||
Flawed](https://arxiv.org/pdf/2305.08891.pdf). Guidance rescale factor should fix overexposure when
|
||||
using zero terminal SNR.
|
||||
clip_skip (`int`, *optional*):
|
||||
Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
|
||||
@@ -1062,7 +1062,7 @@ class RegionalPromptingStableDiffusionPipeline(
|
||||
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
if self.do_classifier_free_guidance and self.guidance_rescale > 0.0:
|
||||
# Based on 3.4. in https://huggingface.co/papers/2305.08891
|
||||
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
|
||||
noise_pred = rescale_noise_cfg(noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale)
|
||||
|
||||
# compute the previous noisy sample x_t -> x_t-1
|
||||
@@ -1668,7 +1668,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
||||
r"""
|
||||
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
|
||||
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://huggingface.co/papers/2305.08891).
|
||||
Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
||||
|
||||
Args:
|
||||
noise_cfg (`torch.Tensor`):
|
||||
|
||||
@@ -268,11 +268,12 @@ provide a simple script for LoRA fine-tuning Kontext in [train_dreambooth_lora_f
|
||||
**important**
|
||||
|
||||
> [!NOTE]
|
||||
> To make sure you can successfully run the latest version of the kontext example script, we highly recommend installing from source.
|
||||
> To make sure you can successfully run the latest version of the kontext example script, we highly recommend installing from source, specifically from the commit mentioned below.
|
||||
> To do this, execute the following steps in a new virtual environment:
|
||||
> ```
|
||||
> git clone https://github.com/huggingface/diffusers
|
||||
> cd diffusers
|
||||
> git checkout 05e7a854d0a5661f5b433f6dd5954c224b104f0b
|
||||
> pip install -e .
|
||||
> ```
|
||||
|
||||
|
||||
@@ -6,20 +6,11 @@ import torch
|
||||
from accelerate import init_empty_weights
|
||||
from huggingface_hub import hf_hub_download, snapshot_download
|
||||
from safetensors.torch import load_file
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
AutoTokenizer,
|
||||
CLIPImageProcessor,
|
||||
CLIPVisionModel,
|
||||
CLIPVisionModelWithProjection,
|
||||
UMT5EncoderModel,
|
||||
)
|
||||
from transformers import AutoProcessor, AutoTokenizer, CLIPVisionModelWithProjection, UMT5EncoderModel
|
||||
|
||||
from diffusers import (
|
||||
AutoencoderKLWan,
|
||||
UniPCMultistepScheduler,
|
||||
WanAnimatePipeline,
|
||||
WanAnimateTransformer3DModel,
|
||||
WanImageToVideoPipeline,
|
||||
WanPipeline,
|
||||
WanTransformer3DModel,
|
||||
@@ -114,203 +105,8 @@ VACE_TRANSFORMER_KEYS_RENAME_DICT = {
|
||||
"after_proj": "proj_out",
|
||||
}
|
||||
|
||||
ANIMATE_TRANSFORMER_KEYS_RENAME_DICT = {
|
||||
"time_embedding.0": "condition_embedder.time_embedder.linear_1",
|
||||
"time_embedding.2": "condition_embedder.time_embedder.linear_2",
|
||||
"text_embedding.0": "condition_embedder.text_embedder.linear_1",
|
||||
"text_embedding.2": "condition_embedder.text_embedder.linear_2",
|
||||
"time_projection.1": "condition_embedder.time_proj",
|
||||
"head.modulation": "scale_shift_table",
|
||||
"head.head": "proj_out",
|
||||
"modulation": "scale_shift_table",
|
||||
"ffn.0": "ffn.net.0.proj",
|
||||
"ffn.2": "ffn.net.2",
|
||||
# Hack to swap the layer names
|
||||
# The original model calls the norms in the following order: norm1, norm3, norm2
|
||||
# We convert it to: norm1, norm2, norm3
|
||||
"norm2": "norm__placeholder",
|
||||
"norm3": "norm2",
|
||||
"norm__placeholder": "norm3",
|
||||
"img_emb.proj.0": "condition_embedder.image_embedder.norm1",
|
||||
"img_emb.proj.1": "condition_embedder.image_embedder.ff.net.0.proj",
|
||||
"img_emb.proj.3": "condition_embedder.image_embedder.ff.net.2",
|
||||
"img_emb.proj.4": "condition_embedder.image_embedder.norm2",
|
||||
# Add attention component mappings
|
||||
"self_attn.q": "attn1.to_q",
|
||||
"self_attn.k": "attn1.to_k",
|
||||
"self_attn.v": "attn1.to_v",
|
||||
"self_attn.o": "attn1.to_out.0",
|
||||
"self_attn.norm_q": "attn1.norm_q",
|
||||
"self_attn.norm_k": "attn1.norm_k",
|
||||
"cross_attn.q": "attn2.to_q",
|
||||
"cross_attn.k": "attn2.to_k",
|
||||
"cross_attn.v": "attn2.to_v",
|
||||
"cross_attn.o": "attn2.to_out.0",
|
||||
"cross_attn.norm_q": "attn2.norm_q",
|
||||
"cross_attn.norm_k": "attn2.norm_k",
|
||||
"cross_attn.k_img": "attn2.to_k_img",
|
||||
"cross_attn.v_img": "attn2.to_v_img",
|
||||
"cross_attn.norm_k_img": "attn2.norm_k_img",
|
||||
# After cross_attn -> attn2 rename, we need to rename the img keys
|
||||
"attn2.to_k_img": "attn2.add_k_proj",
|
||||
"attn2.to_v_img": "attn2.add_v_proj",
|
||||
"attn2.norm_k_img": "attn2.norm_added_k",
|
||||
# Wan Animate-specific mappings (motion encoder, face encoder, face adapter)
|
||||
# Motion encoder mappings
|
||||
# The name mapping is complicated for the convolutional part so we handle that in its own function
|
||||
"motion_encoder.enc.fc": "motion_encoder.motion_network",
|
||||
"motion_encoder.dec.direction.weight": "motion_encoder.motion_synthesis_weight",
|
||||
# Face encoder mappings - CausalConv1d has a .conv submodule that we need to flatten
|
||||
"face_encoder.conv1_local.conv": "face_encoder.conv1_local",
|
||||
"face_encoder.conv2.conv": "face_encoder.conv2",
|
||||
"face_encoder.conv3.conv": "face_encoder.conv3",
|
||||
# Face adapter mappings are handled in a separate function
|
||||
}
|
||||
|
||||
|
||||
# TODO: Verify this and simplify if possible.
|
||||
def convert_animate_motion_encoder_weights(key: str, state_dict: Dict[str, Any], final_conv_idx: int = 8) -> None:
    """
    Convert a single motion encoder entry of the Animate model, modifying `state_dict` in place.

    In the original model:
    - All Linear layers in fc use EqualLinear
    - All Conv2d layers in convs use EqualConv2d (except blur_conv which is initialized separately)
    - Blur kernels are stored as buffers in Sequential modules
    - ConvLayer is nn.Sequential with indices: [Blur (optional), EqualConv2d, FusedLeakyReLU (optional)]

    Conversion strategy:
    1. Drop .kernel buffers (blur kernels)
    2. Rename sequential indices to named components (e.g., 0 -> conv2d, 1 -> bias_leaky_relu)

    Key mapping examples:
    - enc.net_app.convs.0.0.weight -> conv_in.weight (initial conv layer weight)
    - enc.net_app.convs.0.1.bias -> conv_in.act_fn.bias (initial conv layer bias)
    - enc.net_app.convs.{n:1-7}.conv1.0.weight -> res_blocks.{(n-1):0-6}.conv1.weight (conv1 weight)
        - e.g. enc.net_app.convs.1.conv1.0.weight -> res_blocks.0.conv1.weight
    - enc.net_app.convs.{n:1-7}.conv1.1.bias -> res_blocks.{(n-1):0-6}.conv1.act_fn.bias (conv1 bias)
        - e.g. enc.net_app.convs.1.conv1.1.bias -> res_blocks.0.conv1.act_fn.bias
    - enc.net_app.convs.{n:1-7}.conv2.1.weight -> res_blocks.{(n-1):0-6}.conv2.weight (conv2 weight)
    - enc.net_app.convs.1.conv2.2.bias -> res_blocks.0.conv2.act_fn.bias (conv2 bias)
    - enc.net_app.convs.{n:1-7}.skip.1.weight -> res_blocks.{(n-1):0-6}.conv_skip.weight (skip conv weight)
    - enc.net_app.convs.8 -> conv_out (final conv layer)

    Args:
        key: Name of the state dict entry to convert.
        state_dict: Mapping from parameter names to tensors. The entry for `key` is renamed or removed in place.
        final_conv_idx: Index inside `enc.net_app.convs` of the final conv layer, which maps to `conv_out`.

    Entries for which no target name is known (for example a bias on the final conv layer, or a key that does not
    end in `.weight` / `.bias`) are left untouched rather than being popped and lost.

    Raises:
        ValueError: If the component following `convs` in the key is not an integer.
    """
    # Skip if not a weight, bias, or kernel
    if ".weight" not in key and ".bias" not in key and ".kernel" not in key:
        return

    # Handle Blur kernel buffers from original implementation.
    # Diffusers constructs blur kernels as a non-persistent buffer so we must drop these keys
    # to avoid strict load errors.
    if ".kernel" in key and "motion_encoder" in key:
        state_dict.pop(key, None)
        return

    # Only the convolutional part (ConvLayer / ResBlock Sequential indices) is renamed here
    if ".enc.net_app.convs." not in key:
        return

    # Resolve the parameter suffix first so nothing is mutated for keys we cannot map
    if key.endswith(".weight"):
        param_name = "weight"
        is_bias = False
    elif key.endswith(".bias"):
        param_name = "act_fn.bias"
        is_bias = True
    else:
        return

    parts = key.split(".")
    convs_idx = parts.index("convs")
    if len(parts) - convs_idx < 2:
        return

    # The nn.Sequential index will always follow convs
    sequential_idx = int(parts[convs_idx + 1])
    if sequential_idx == 0:
        new_key = f"motion_encoder.conv_in.{param_name}"
    elif sequential_idx == final_conv_idx:
        if is_bias:
            # No target name exists for a bias on the final conv layer; keep the entry as-is
            return
        new_key = "motion_encoder.conv_out.weight"
    else:
        # Intermediate .convs. layers, which get mapped to .res_blocks. (shifted down by one index)
        layer_name = parts[convs_idx + 2]
        if layer_name == "skip":
            layer_name = "conv_skip"
        new_key = ".".join(["motion_encoder.res_blocks", str(sequential_idx - 1), layer_name, param_name])

    param = state_dict.pop(key)
    if is_bias:
        # Biases are stored with their singleton dimensions removed
        param = param.squeeze()
    state_dict[new_key] = param
|
||||
|
||||
|
||||
def convert_animate_face_adapter_weights(key: str, state_dict: Dict[str, Any]) -> None:
    """
    Convert a single face adapter entry of the Animate model, modifying `state_dict` in place.

    The original model uses a fused KV projection but the diffusers model uses separate K and V projections, so
    `linear1_kv` entries are split in half along dim 0 into `to_k` and `to_v`. The remaining known layers are
    renamed: q_norm -> norm_q, k_norm -> norm_k, linear1_q -> to_q, linear2 -> to_out.

    Only keys of the form `<...>.fuser_blocks.<block_idx>.<layer_name>.<param_name>` are handled; the converted
    key is `face_adapter.<block_idx>.<new_layer_name>.<param_name>`.

    Args:
        key: Name of the state dict entry to convert.
        state_dict: Mapping from parameter names to tensors. The entry for `key` is renamed or split in place.

    Entries whose layer name is not recognized are left untouched instead of failing on an unbound local name.
    """
    layer_renames = {
        "q_norm": "norm_q",
        "k_norm": "norm_k",
        "linear1_q": "to_q",
        "linear2": "to_out",
    }

    # Skip if not a weight or bias
    if ".weight" not in key and ".bias" not in key:
        return

    if ".fuser_blocks." not in key:
        return

    parts = key.split(".")
    module_list_idx = parts.index("fuser_blocks")
    # Expect exactly three components after fuser_blocks: block index, layer name, parameter name
    if (len(parts) - 1) - module_list_idx != 3:
        return

    block_idx, layer_name, param_name = parts[module_list_idx + 1 :]
    prefix = "face_adapter."

    if layer_name == "linear1_kv":
        # Split the fused KV projection: first half along dim 0 is K, second half is V
        kv_proj = state_dict.pop(key)
        k_proj, v_proj = torch.chunk(kv_proj, 2, dim=0)
        state_dict[prefix + ".".join([block_idx, "to_k", param_name])] = k_proj
        state_dict[prefix + ".".join([block_idx, "to_v", param_name])] = v_proj
        return

    new_layer_name = layer_renames.get(layer_name)
    if new_layer_name is None:
        # Unknown layer: keep the entry as-is so nothing is silently renamed or lost
        return

    new_key = prefix + ".".join([block_idx, new_layer_name, param_name])
    state_dict[new_key] = state_dict.pop(key)
|
||||
|
||||
|
||||
TRANSFORMER_SPECIAL_KEYS_REMAP = {}
|
||||
VACE_TRANSFORMER_SPECIAL_KEYS_REMAP = {}
|
||||
ANIMATE_TRANSFORMER_SPECIAL_KEYS_REMAP = {
|
||||
"motion_encoder": convert_animate_motion_encoder_weights,
|
||||
"face_adapter": convert_animate_face_adapter_weights,
|
||||
}
|
||||
|
||||
|
||||
def update_state_dict_(state_dict: Dict[str, Any], old_key: str, new_key: str) -> Dict[str, Any]:
|
||||
@@ -568,37 +364,6 @@ def get_transformer_config(model_type: str) -> Tuple[Dict[str, Any], ...]:
|
||||
}
|
||||
RENAME_DICT = TRANSFORMER_KEYS_RENAME_DICT
|
||||
SPECIAL_KEYS_REMAP = TRANSFORMER_SPECIAL_KEYS_REMAP
|
||||
elif model_type == "Wan2.2-Animate-14B":
|
||||
config = {
|
||||
"model_id": "Wan-AI/Wan2.2-Animate-14B",
|
||||
"diffusers_config": {
|
||||
"image_dim": 1280,
|
||||
"added_kv_proj_dim": 5120,
|
||||
"attention_head_dim": 128,
|
||||
"cross_attn_norm": True,
|
||||
"eps": 1e-06,
|
||||
"ffn_dim": 13824,
|
||||
"freq_dim": 256,
|
||||
"in_channels": 36,
|
||||
"num_attention_heads": 40,
|
||||
"num_layers": 40,
|
||||
"out_channels": 16,
|
||||
"patch_size": (1, 2, 2),
|
||||
"qk_norm": "rms_norm_across_heads",
|
||||
"text_dim": 4096,
|
||||
"rope_max_seq_len": 1024,
|
||||
"pos_embed_seq_len": None,
|
||||
"motion_encoder_size": 512, # Start of Wan Animate-specific configs
|
||||
"motion_style_dim": 512,
|
||||
"motion_dim": 20,
|
||||
"motion_encoder_dim": 512,
|
||||
"face_encoder_hidden_dim": 1024,
|
||||
"face_encoder_num_heads": 4,
|
||||
"inject_face_latents_blocks": 5,
|
||||
},
|
||||
}
|
||||
RENAME_DICT = ANIMATE_TRANSFORMER_KEYS_RENAME_DICT
|
||||
SPECIAL_KEYS_REMAP = ANIMATE_TRANSFORMER_SPECIAL_KEYS_REMAP
|
||||
return config, RENAME_DICT, SPECIAL_KEYS_REMAP
|
||||
|
||||
|
||||
@@ -615,12 +380,10 @@ def convert_transformer(model_type: str, stage: str = None):
|
||||
original_state_dict = load_sharded_safetensors(model_dir)
|
||||
|
||||
with init_empty_weights():
|
||||
if "Animate" in model_type:
|
||||
transformer = WanAnimateTransformer3DModel.from_config(diffusers_config)
|
||||
elif "VACE" in model_type:
|
||||
transformer = WanVACETransformer3DModel.from_config(diffusers_config)
|
||||
else:
|
||||
if "VACE" not in model_type:
|
||||
transformer = WanTransformer3DModel.from_config(diffusers_config)
|
||||
else:
|
||||
transformer = WanVACETransformer3DModel.from_config(diffusers_config)
|
||||
|
||||
for key in list(original_state_dict.keys()):
|
||||
new_key = key[:]
|
||||
@@ -634,12 +397,7 @@ def convert_transformer(model_type: str, stage: str = None):
|
||||
continue
|
||||
handler_fn_inplace(key, original_state_dict)
|
||||
|
||||
# Load state dict into the meta model, which will materialize the tensors
|
||||
transformer.load_state_dict(original_state_dict, strict=True, assign=True)
|
||||
|
||||
# Move to CPU to ensure all tensors are materialized
|
||||
transformer = transformer.to("cpu")
|
||||
|
||||
return transformer
|
||||
|
||||
|
||||
@@ -1168,7 +926,7 @@ DTYPE_MAPPING = {
|
||||
if __name__ == "__main__":
|
||||
args = get_args()
|
||||
|
||||
if "Wan2.2" in args.model_type and "TI2V" not in args.model_type and "Animate" not in args.model_type:
|
||||
if "Wan2.2" in args.model_type and "TI2V" not in args.model_type:
|
||||
transformer = convert_transformer(args.model_type, stage="high_noise_model")
|
||||
transformer_2 = convert_transformer(args.model_type, stage="low_noise_model")
|
||||
else:
|
||||
@@ -1184,7 +942,7 @@ if __name__ == "__main__":
|
||||
tokenizer = AutoTokenizer.from_pretrained("google/umt5-xxl")
|
||||
if "FLF2V" in args.model_type:
|
||||
flow_shift = 16.0
|
||||
elif "TI2V" in args.model_type or "Animate" in args.model_type:
|
||||
elif "TI2V" in args.model_type:
|
||||
flow_shift = 5.0
|
||||
else:
|
||||
flow_shift = 3.0
|
||||
@@ -1196,8 +954,6 @@ if __name__ == "__main__":
|
||||
if args.dtype != "none":
|
||||
dtype = DTYPE_MAPPING[args.dtype]
|
||||
transformer.to(dtype)
|
||||
if transformer_2 is not None:
|
||||
transformer_2.to(dtype)
|
||||
|
||||
if "Wan2.2" and "I2V" in args.model_type and "TI2V" not in args.model_type:
|
||||
pipe = WanImageToVideoPipeline(
|
||||
@@ -1260,21 +1016,6 @@ if __name__ == "__main__":
|
||||
vae=vae,
|
||||
scheduler=scheduler,
|
||||
)
|
||||
elif "Animate" in args.model_type:
|
||||
image_encoder = CLIPVisionModel.from_pretrained(
|
||||
"laion/CLIP-ViT-H-14-laion2B-s32B-b79K", torch_dtype=torch.bfloat16
|
||||
)
|
||||
image_processor = CLIPImageProcessor.from_pretrained("laion/CLIP-ViT-H-14-laion2B-s32B-b79K")
|
||||
|
||||
pipe = WanAnimatePipeline(
|
||||
transformer=transformer,
|
||||
text_encoder=text_encoder,
|
||||
tokenizer=tokenizer,
|
||||
vae=vae,
|
||||
scheduler=scheduler,
|
||||
image_encoder=image_encoder,
|
||||
image_processor=image_processor,
|
||||
)
|
||||
else:
|
||||
pipe = WanPipeline(
|
||||
transformer=transformer,
|
||||
|
||||
@@ -268,7 +268,6 @@ else:
|
||||
"UNetSpatioTemporalConditionModel",
|
||||
"UVit2DModel",
|
||||
"VQModel",
|
||||
"WanAnimateTransformer3DModel",
|
||||
"WanTransformer3DModel",
|
||||
"WanVACETransformer3DModel",
|
||||
"attention_backend",
|
||||
@@ -637,7 +636,6 @@ else:
|
||||
"VisualClozeGenerationPipeline",
|
||||
"VisualClozePipeline",
|
||||
"VQDiffusionPipeline",
|
||||
"WanAnimatePipeline",
|
||||
"WanImageToVideoPipeline",
|
||||
"WanPipeline",
|
||||
"WanVACEPipeline",
|
||||
@@ -979,7 +977,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
UNetSpatioTemporalConditionModel,
|
||||
UVit2DModel,
|
||||
VQModel,
|
||||
WanAnimateTransformer3DModel,
|
||||
WanTransformer3DModel,
|
||||
WanVACETransformer3DModel,
|
||||
attention_backend,
|
||||
@@ -1318,7 +1315,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
VisualClozeGenerationPipeline,
|
||||
VisualClozePipeline,
|
||||
VQDiffusionPipeline,
|
||||
WanAnimatePipeline,
|
||||
WanImageToVideoPipeline,
|
||||
WanPipeline,
|
||||
WanVACEPipeline,
|
||||
|
||||
@@ -373,7 +373,7 @@ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
|
||||
r"""
|
||||
Rescales `noise_cfg` tensor based on `guidance_rescale` to improve image quality and fix overexposure. Based on
|
||||
Section 3.4 from [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://huggingface.co/papers/2305.08891).
|
||||
Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
||||
|
||||
Args:
|
||||
noise_cfg (`torch.Tensor`):
|
||||
|
||||
@@ -409,7 +409,7 @@ class VaeImageProcessor(ConfigMixin):
|
||||
src_w = width if ratio < src_ratio else image.width * height // image.height
|
||||
src_h = height if ratio >= src_ratio else image.height * width // image.width
|
||||
|
||||
resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION[self.config.resample])
|
||||
resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION["lanczos"])
|
||||
res = Image.new("RGB", (width, height))
|
||||
res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
|
||||
|
||||
@@ -460,7 +460,7 @@ class VaeImageProcessor(ConfigMixin):
|
||||
src_w = width if ratio > src_ratio else image.width * height // image.height
|
||||
src_h = height if ratio <= src_ratio else image.height * width // image.width
|
||||
|
||||
resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION[self.config.resample])
|
||||
resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION["lanczos"])
|
||||
res = Image.new("RGB", (width, height))
|
||||
res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
|
||||
return res
|
||||
|
||||
@@ -108,7 +108,6 @@ if is_torch_available():
|
||||
_import_structure["transformers.transformer_skyreels_v2"] = ["SkyReelsV2Transformer3DModel"]
|
||||
_import_structure["transformers.transformer_temporal"] = ["TransformerTemporalModel"]
|
||||
_import_structure["transformers.transformer_wan"] = ["WanTransformer3DModel"]
|
||||
_import_structure["transformers.transformer_wan_animate"] = ["WanAnimateTransformer3DModel"]
|
||||
_import_structure["transformers.transformer_wan_vace"] = ["WanVACETransformer3DModel"]
|
||||
_import_structure["unets.unet_1d"] = ["UNet1DModel"]
|
||||
_import_structure["unets.unet_2d"] = ["UNet2DModel"]
|
||||
@@ -215,7 +214,6 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
T5FilmDecoder,
|
||||
Transformer2DModel,
|
||||
TransformerTemporalModel,
|
||||
WanAnimateTransformer3DModel,
|
||||
WanTransformer3DModel,
|
||||
WanVACETransformer3DModel,
|
||||
)
|
||||
|
||||
@@ -383,18 +383,12 @@ def _check_shape(
|
||||
attn_mask: Optional[torch.Tensor] = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
# Expected shapes:
|
||||
# query: (batch_size, seq_len_q, num_heads, head_dim)
|
||||
# key: (batch_size, seq_len_kv, num_heads, head_dim)
|
||||
# value: (batch_size, seq_len_kv, num_heads, head_dim)
|
||||
# attn_mask: (seq_len_q, seq_len_kv) or (batch_size, seq_len_q, seq_len_kv)
|
||||
# or (batch_size, num_heads, seq_len_q, seq_len_kv)
|
||||
if query.shape[-1] != key.shape[-1]:
|
||||
raise ValueError("Query and key must have the same head dimension.")
|
||||
if key.shape[-3] != value.shape[-3]:
|
||||
raise ValueError("Key and value must have the same sequence length.")
|
||||
if attn_mask is not None and attn_mask.shape[-1] != key.shape[-3]:
|
||||
raise ValueError("Attention mask must match the key's sequence length.")
|
||||
raise ValueError("Query and key must have the same last dimension.")
|
||||
if query.shape[-2] != value.shape[-2]:
|
||||
raise ValueError("Query and value must have the same second to last dimension.")
|
||||
if attn_mask is not None and attn_mask.shape[-1] != key.shape[-2]:
|
||||
raise ValueError("Attention mask must match the key's second to last dimension.")
|
||||
|
||||
|
||||
# ===== Helper functions =====
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# QwenImageVAE is further fine-tuned from the Wan Video VAE to achieve improved performance.
|
||||
# For more information about the Wan VAE, please refer to:
|
||||
# - GitHub: https://github.com/Wan-Video/Wan2.1
|
||||
# - Paper: https://huggingface.co/papers/2503.20314
|
||||
# - arXiv: https://arxiv.org/abs/2503.20314
|
||||
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
|
||||
@@ -42,5 +42,4 @@ if is_torch_available():
|
||||
from .transformer_skyreels_v2 import SkyReelsV2Transformer3DModel
|
||||
from .transformer_temporal import TransformerTemporalModel
|
||||
from .transformer_wan import WanTransformer3DModel
|
||||
from .transformer_wan_animate import WanAnimateTransformer3DModel
|
||||
from .transformer_wan_vace import WanVACETransformer3DModel
|
||||
|
||||
@@ -275,12 +275,7 @@ class PRXEmbedND(nn.Module):
|
||||
|
||||
def rope(self, pos: torch.Tensor, dim: int, theta: int) -> torch.Tensor:
|
||||
assert dim % 2 == 0
|
||||
|
||||
is_mps = pos.device.type == "mps"
|
||||
is_npu = pos.device.type == "npu"
|
||||
dtype = torch.float32 if (is_mps or is_npu) else torch.float64
|
||||
|
||||
scale = torch.arange(0, dim, 2, dtype=dtype, device=pos.device) / dim
|
||||
scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
|
||||
omega = 1.0 / (theta**scale)
|
||||
out = pos.unsqueeze(-1) * omega.unsqueeze(0)
|
||||
out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
|
||||
|
||||
@@ -188,11 +188,6 @@ class WanRotaryPosEmbed(nn.Module):
|
||||
|
||||
h_dim = w_dim = 2 * (attention_head_dim // 6)
|
||||
t_dim = attention_head_dim - h_dim - w_dim
|
||||
|
||||
self.t_dim = t_dim
|
||||
self.h_dim = h_dim
|
||||
self.w_dim = w_dim
|
||||
|
||||
freqs_dtype = torch.float32 if torch.backends.mps.is_available() else torch.float64
|
||||
|
||||
freqs_cos = []
|
||||
@@ -218,7 +213,11 @@ class WanRotaryPosEmbed(nn.Module):
|
||||
p_t, p_h, p_w = self.patch_size
|
||||
ppf, pph, ppw = num_frames // p_t, height // p_h, width // p_w
|
||||
|
||||
split_sizes = [self.t_dim, self.h_dim, self.w_dim]
|
||||
split_sizes = [
|
||||
self.attention_head_dim - 2 * (self.attention_head_dim // 3),
|
||||
self.attention_head_dim // 3,
|
||||
self.attention_head_dim // 3,
|
||||
]
|
||||
|
||||
freqs_cos = self.freqs_cos.split(split_sizes, dim=1)
|
||||
freqs_sin = self.freqs_sin.split(split_sizes, dim=1)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -861,10 +861,6 @@ class SequentialPipelineBlocks(ModularPipelineBlocks):
|
||||
else:
|
||||
sub_blocks[block_name] = block
|
||||
self.sub_blocks = sub_blocks
|
||||
if not len(self.block_names) == len(self.block_classes):
|
||||
raise ValueError(
|
||||
f"In {self.__class__.__name__}, the number of block_names and block_classes must be the same."
|
||||
)
|
||||
|
||||
def _get_inputs(self):
|
||||
inputs = []
|
||||
|
||||
@@ -132,7 +132,6 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
|
||||
@property
|
||||
def inputs(self) -> List[InputParam]:
|
||||
return [
|
||||
InputParam("latents"),
|
||||
InputParam(name="height"),
|
||||
InputParam(name="width"),
|
||||
InputParam(name="num_images_per_prompt", default=1),
|
||||
@@ -197,11 +196,11 @@ class QwenImagePrepareLatentsStep(ModularPipelineBlocks):
|
||||
f"You have passed a list of generators of length {len(block_state.generator)}, but requested an effective batch"
|
||||
f" size of {batch_size}. Make sure the batch size matches the length of the generators."
|
||||
)
|
||||
if block_state.latents is None:
|
||||
block_state.latents = randn_tensor(
|
||||
shape, generator=block_state.generator, device=device, dtype=block_state.dtype
|
||||
)
|
||||
block_state.latents = components.pachifier.pack_latents(block_state.latents)
|
||||
|
||||
block_state.latents = randn_tensor(
|
||||
shape, generator=block_state.generator, device=device, dtype=block_state.dtype
|
||||
)
|
||||
block_state.latents = components.pachifier.pack_latents(block_state.latents)
|
||||
|
||||
self.set_block_state(state, block_state)
|
||||
return components, state
|
||||
@@ -550,7 +549,8 @@ class QwenImageRoPEInputsStep(ModularPipelineBlocks):
|
||||
block_state.width // components.vae_scale_factor // 2,
|
||||
)
|
||||
]
|
||||
] * block_state.batch_size
|
||||
* block_state.batch_size
|
||||
]
|
||||
block_state.txt_seq_lens = (
|
||||
block_state.prompt_embeds_mask.sum(dim=1).tolist() if block_state.prompt_embeds_mask is not None else None
|
||||
)
|
||||
|
||||
@@ -74,9 +74,8 @@ class QwenImageDecoderStep(ModularPipelineBlocks):
|
||||
block_state = self.get_block_state(state)
|
||||
|
||||
# YiYi Notes: remove support for output_type = "latents', we can just skip decode/encode step in modular
|
||||
vae_scale_factor = components.vae_scale_factor
|
||||
block_state.latents = components.pachifier.unpack_latents(
|
||||
block_state.latents, block_state.height, block_state.width, vae_scale_factor=vae_scale_factor
|
||||
block_state.latents, block_state.height, block_state.width
|
||||
)
|
||||
block_state.latents = block_state.latents.to(components.vae.dtype)
|
||||
|
||||
|
||||
@@ -503,8 +503,6 @@ class QwenImageTextEncoderStep(ModularPipelineBlocks):
|
||||
block_state.prompt_embeds = block_state.prompt_embeds[:, : block_state.max_sequence_length]
|
||||
block_state.prompt_embeds_mask = block_state.prompt_embeds_mask[:, : block_state.max_sequence_length]
|
||||
|
||||
block_state.negative_prompt_embeds = None
|
||||
block_state.negative_prompt_embeds_mask = None
|
||||
if components.requires_unconditional_embeds:
|
||||
negative_prompt = block_state.negative_prompt or ""
|
||||
block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds(
|
||||
@@ -629,8 +627,6 @@ class QwenImageEditTextEncoderStep(ModularPipelineBlocks):
|
||||
device=device,
|
||||
)
|
||||
|
||||
block_state.negative_prompt_embeds = None
|
||||
block_state.negative_prompt_embeds_mask = None
|
||||
if components.requires_unconditional_embeds:
|
||||
negative_prompt = block_state.negative_prompt or " "
|
||||
block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = get_qwen_prompt_embeds_edit(
|
||||
@@ -683,8 +679,6 @@ class QwenImageEditPlusTextEncoderStep(QwenImageEditTextEncoderStep):
|
||||
device=device,
|
||||
)
|
||||
|
||||
block_state.negative_prompt_embeds = None
|
||||
block_state.negative_prompt_embeds_mask = None
|
||||
if components.requires_unconditional_embeds:
|
||||
negative_prompt = block_state.negative_prompt or " "
|
||||
block_state.negative_prompt_embeds, block_state.negative_prompt_embeds_mask = (
|
||||
|
||||
@@ -523,7 +523,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
|
||||
QwenImageOptionalControlNetBeforeDenoiseStep,
|
||||
QwenImageAutoDenoiseStep,
|
||||
]
|
||||
block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise"]
|
||||
block_names = ["input", "controlnet_input", "before_denoise", "controlnet_before_denoise", "denoise", "decode"]
|
||||
|
||||
@property
|
||||
def description(self):
|
||||
@@ -534,6 +534,7 @@ class QwenImageCoreDenoiseStep(SequentialPipelineBlocks):
|
||||
+ " - `QwenImageAutoBeforeDenoiseStep` (before_denoise) prepares the inputs for the denoising step.\n"
|
||||
+ " - `QwenImageOptionalControlNetBeforeDenoiseStep` (controlnet_before_denoise) prepares the controlnet input for the denoising step.\n"
|
||||
+ " - `QwenImageAutoDenoiseStep` (denoise) iteratively denoises the latents.\n"
|
||||
+ " - `QwenImageAutoDecodeStep` (decode) decodes the latents into images.\n\n"
|
||||
+ "This step support text-to-image, image-to-image, inpainting, and controlnet tasks for QwenImage:\n"
|
||||
+ " - for image-to-image generation, you need to provide `image_latents`\n"
|
||||
+ " - for inpainting, you need to provide `processed_mask_image` and `image_latents`\n"
|
||||
|
||||
@@ -26,7 +26,10 @@ class QwenImagePachifier(ConfigMixin):
|
||||
config_name = "config.json"
|
||||
|
||||
@register_to_config
|
||||
def __init__(self, patch_size: int = 2):
|
||||
def __init__(
|
||||
self,
|
||||
patch_size: int = 2,
|
||||
):
|
||||
super().__init__()
|
||||
|
||||
def pack_latents(self, latents):
|
||||
|
||||
@@ -385,13 +385,7 @@ else:
|
||||
"WuerstchenDecoderPipeline",
|
||||
"WuerstchenPriorPipeline",
|
||||
]
|
||||
_import_structure["wan"] = [
|
||||
"WanPipeline",
|
||||
"WanImageToVideoPipeline",
|
||||
"WanVideoToVideoPipeline",
|
||||
"WanVACEPipeline",
|
||||
"WanAnimatePipeline",
|
||||
]
|
||||
_import_structure["wan"] = ["WanPipeline", "WanImageToVideoPipeline", "WanVideoToVideoPipeline", "WanVACEPipeline"]
|
||||
_import_structure["kandinsky5"] = ["Kandinsky5T2VPipeline"]
|
||||
_import_structure["skyreels_v2"] = [
|
||||
"SkyReelsV2DiffusionForcingPipeline",
|
||||
@@ -809,13 +803,7 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
UniDiffuserTextDecoder,
|
||||
)
|
||||
from .visualcloze import VisualClozeGenerationPipeline, VisualClozePipeline
|
||||
from .wan import (
|
||||
WanAnimatePipeline,
|
||||
WanImageToVideoPipeline,
|
||||
WanPipeline,
|
||||
WanVACEPipeline,
|
||||
WanVideoToVideoPipeline,
|
||||
)
|
||||
from .wan import WanImageToVideoPipeline, WanPipeline, WanVACEPipeline, WanVideoToVideoPipeline
|
||||
from .wuerstchen import (
|
||||
WuerstchenCombinedPipeline,
|
||||
WuerstchenDecoderPipeline,
|
||||
|
||||
@@ -245,7 +245,7 @@ class BriaPipeline(DiffusionPipeline):
|
||||
return self._guidance_scale
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
@property
|
||||
def do_classifier_free_guidance(self):
|
||||
@@ -489,11 +489,11 @@ class BriaPipeline(DiffusionPipeline):
|
||||
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
|
||||
passed will be used. Must be in descending order.
|
||||
guidance_scale (`float`, *optional*, defaults to 5.0):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
||||
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
||||
|
||||
@@ -337,7 +337,7 @@ class BriaFiboPipeline(DiffusionPipeline):
|
||||
return self._guidance_scale
|
||||
|
||||
# here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
|
||||
# of the Imagen paper: https://huggingface.co/papers/2205.11487 . `guidance_scale = 1`
|
||||
# of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
|
||||
# corresponds to doing no classifier free guidance.
|
||||
|
||||
@property
|
||||
@@ -498,11 +498,11 @@ class BriaFiboPipeline(DiffusionPipeline):
|
||||
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
|
||||
passed will be used. Must be in descending order.
|
||||
guidance_scale (`float`, *optional*, defaults to 5.0):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
negative_prompt (`str` or `List[str]`, *optional*):
|
||||
The prompt or prompts not to guide the image generation. If not defined, one has to pass
|
||||
`negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
|
||||
|
||||
@@ -590,10 +590,9 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixi
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
||||
Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
|
||||
[Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
|
||||
using zero terminal SNR.
|
||||
Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
|
||||
[Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
||||
Guidance rescale factor should fix overexposure when using zero terminal SNR.
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of videos to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
@@ -778,7 +777,7 @@ class LTXPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraLoaderMixi
|
||||
noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond)
|
||||
|
||||
if self.guidance_rescale > 0:
|
||||
# Based on 3.4. in https://huggingface.co/papers/2305.08891
|
||||
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
|
||||
noise_pred = rescale_noise_cfg(
|
||||
noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
|
||||
)
|
||||
|
||||
@@ -927,10 +927,9 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraL
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
||||
Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
|
||||
[Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
|
||||
using zero terminal SNR.
|
||||
Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
|
||||
[Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
||||
Guidance rescale factor should fix overexposure when using zero terminal SNR.
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of videos to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
@@ -1195,7 +1194,7 @@ class LTXConditionPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLoraL
|
||||
timestep, _ = timestep.chunk(2)
|
||||
|
||||
if self.guidance_rescale > 0:
|
||||
# Based on 3.4. in https://huggingface.co/papers/2305.08891
|
||||
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
|
||||
noise_pred = rescale_noise_cfg(
|
||||
noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
|
||||
)
|
||||
|
||||
@@ -654,10 +654,9 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
guidance_rescale (`float`, *optional*, defaults to 0.0):
|
||||
Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://huggingface.co/papers/2305.08891) `guidance_scale` is defined as `φ` in equation 16. of
|
||||
[Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://huggingface.co/papers/2305.08891). Guidance rescale factor should fix overexposure when
|
||||
using zero terminal SNR.
|
||||
Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
|
||||
[Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
|
||||
Guidance rescale factor should fix overexposure when using zero terminal SNR.
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of videos to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
@@ -852,7 +851,7 @@ class LTXImageToVideoPipeline(DiffusionPipeline, FromSingleFileMixin, LTXVideoLo
|
||||
timestep, _ = timestep.chunk(2)
|
||||
|
||||
if self.guidance_rescale > 0:
|
||||
# Based on 3.4. in https://huggingface.co/papers/2305.08891
|
||||
# Based on 3.4. in https://arxiv.org/pdf/2305.08891.pdf
|
||||
noise_pred = rescale_noise_cfg(
|
||||
noise_pred, noise_pred_text, guidance_rescale=self.guidance_rescale
|
||||
)
|
||||
|
||||
@@ -69,39 +69,6 @@ ASPECT_RATIO_512_BIN = {
|
||||
"2.0": [704, 352],
|
||||
}
|
||||
|
||||
ASPECT_RATIO_1024_BIN = {
|
||||
"0.49": [704, 1440],
|
||||
"0.52": [736, 1408],
|
||||
"0.53": [736, 1376],
|
||||
"0.57": [768, 1344],
|
||||
"0.59": [768, 1312],
|
||||
"0.62": [800, 1280],
|
||||
"0.67": [832, 1248],
|
||||
"0.68": [832, 1216],
|
||||
"0.78": [896, 1152],
|
||||
"0.83": [928, 1120],
|
||||
"0.94": [992, 1056],
|
||||
"1.0": [1024, 1024],
|
||||
"1.06": [1056, 992],
|
||||
"1.13": [1088, 960],
|
||||
"1.21": [1120, 928],
|
||||
"1.29": [1152, 896],
|
||||
"1.37": [1184, 864],
|
||||
"1.46": [1216, 832],
|
||||
"1.5": [1248, 832],
|
||||
"1.71": [1312, 768],
|
||||
"1.75": [1344, 768],
|
||||
"1.87": [1376, 736],
|
||||
"1.91": [1408, 736],
|
||||
"2.05": [1440, 704],
|
||||
}
|
||||
|
||||
ASPECT_RATIO_BINS = {
|
||||
256: ASPECT_RATIO_256_BIN,
|
||||
512: ASPECT_RATIO_512_BIN,
|
||||
1024: ASPECT_RATIO_1024_BIN,
|
||||
}
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
|
||||
@@ -569,11 +536,11 @@ class PRXPipeline(
|
||||
in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
|
||||
passed will be used. Must be in descending order.
|
||||
guidance_scale (`float`, *optional*, defaults to 4.0):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
num_images_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
@@ -633,12 +600,10 @@ class PRXPipeline(
|
||||
"Resolution binning requires a VAE with image_processor, but VAE is not available. "
|
||||
"Set use_resolution_binning=False or provide a VAE."
|
||||
)
|
||||
if self.default_sample_size not in ASPECT_RATIO_BINS:
|
||||
raise ValueError(
|
||||
f"Resolution binning is only supported for default_sample_size in {list(ASPECT_RATIO_BINS.keys())}, "
|
||||
f"but got {self.default_sample_size}. Set use_resolution_binning=False to disable aspect ratio binning."
|
||||
)
|
||||
aspect_ratio_bin = ASPECT_RATIO_BINS[self.default_sample_size]
|
||||
if self.default_sample_size <= 256:
|
||||
aspect_ratio_bin = ASPECT_RATIO_256_BIN
|
||||
else:
|
||||
aspect_ratio_bin = ASPECT_RATIO_512_BIN
|
||||
|
||||
# Store original dimensions
|
||||
orig_height, orig_width = height, width
|
||||
|
||||
@@ -415,11 +415,11 @@ class SkyReelsV2Pipeline(DiffusionPipeline, SkyReelsV2LoraLoaderMixin):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, defaults to `6.0`):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
|
||||
@@ -647,11 +647,11 @@ class SkyReelsV2DiffusionForcingPipeline(DiffusionPipeline, SkyReelsV2LoraLoader
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, defaults to `6.0`):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
|
||||
@@ -698,11 +698,11 @@ class SkyReelsV2DiffusionForcingImageToVideoPipeline(DiffusionPipeline, SkyReels
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, defaults to `5.0`):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality. (**6.0 for T2V**, **5.0 for I2V**)
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
|
||||
@@ -524,11 +524,11 @@ class SkyReelsV2ImageToVideoPipeline(DiffusionPipeline, SkyReelsV2LoraLoaderMixi
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, defaults to `5.0`):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
num_videos_per_prompt (`int`, *optional*, defaults to 1):
|
||||
The number of images to generate per prompt.
|
||||
generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
|
||||
|
||||
@@ -23,7 +23,6 @@ except OptionalDependencyNotAvailable:
|
||||
_dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
|
||||
else:
|
||||
_import_structure["pipeline_wan"] = ["WanPipeline"]
|
||||
_import_structure["pipeline_wan_animate"] = ["WanAnimatePipeline"]
|
||||
_import_structure["pipeline_wan_i2v"] = ["WanImageToVideoPipeline"]
|
||||
_import_structure["pipeline_wan_vace"] = ["WanVACEPipeline"]
|
||||
_import_structure["pipeline_wan_video2video"] = ["WanVideoToVideoPipeline"]
|
||||
@@ -36,10 +35,10 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
|
||||
from ...utils.dummy_torch_and_transformers_objects import *
|
||||
else:
|
||||
from .pipeline_wan import WanPipeline
|
||||
from .pipeline_wan_animate import WanAnimatePipeline
|
||||
from .pipeline_wan_i2v import WanImageToVideoPipeline
|
||||
from .pipeline_wan_vace import WanVACEPipeline
|
||||
from .pipeline_wan_video2video import WanVideoToVideoPipeline
|
||||
|
||||
else:
|
||||
import sys
|
||||
|
||||
|
||||
@@ -1,185 +0,0 @@
|
||||
# Copyright 2025 The Wan Team and The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import PIL.Image
|
||||
import torch
|
||||
|
||||
from ...configuration_utils import register_to_config
|
||||
from ...image_processor import VaeImageProcessor
|
||||
from ...utils import PIL_INTERPOLATION
|
||||
|
||||
|
||||
class WanAnimateImageProcessor(VaeImageProcessor):
|
||||
r"""
|
||||
Image processor to preprocess the reference (character) image for the Wan Animate model.
|
||||
|
||||
Args:
|
||||
do_resize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to downscale the image's (height, width) dimensions to multiples of `vae_scale_factor`. Can accept
|
||||
`height` and `width` arguments from [`image_processor.VaeImageProcessor.preprocess`] method.
|
||||
vae_scale_factor (`int`, *optional*, defaults to `8`):
|
||||
VAE (spatial) scale factor. If `do_resize` is `True`, the image is automatically resized to multiples of
|
||||
this factor.
|
||||
vae_latent_channels (`int`, *optional*, defaults to `16`):
|
||||
VAE latent channels.
|
||||
spatial_patch_size (`Tuple[int, int]`, *optional*, defaults to `(2, 2)`):
|
||||
The spatial patch size used by the diffusion transformer. For Wan models, this is typically (2, 2).
|
||||
resample (`str`, *optional*, defaults to `lanczos`):
|
||||
Resampling filter to use when resizing the image.
|
||||
do_normalize (`bool`, *optional*, defaults to `True`):
|
||||
Whether to normalize the image to [-1,1].
|
||||
do_binarize (`bool`, *optional*, defaults to `False`):
|
||||
Whether to binarize the image to 0/1.
|
||||
do_convert_rgb (`bool`, *optional*, defaults to be `False`):
|
||||
Whether to convert the images to RGB format.
|
||||
do_convert_grayscale (`bool`, *optional*, defaults to be `False`):
|
||||
Whether to convert the images to grayscale format.
|
||||
fill_color (`str` or `float` or `Tuple[float, ...]`, *optional*, defaults to `None`):
|
||||
An optional fill color when `resize_mode` is set to `"fill"`. This will fill the empty space with that
|
||||
color instead of filling with data from the image. Any valid `color` argument to `PIL.Image.new` is valid;
|
||||
if `None`, will default to filling with data from `image`.
|
||||
"""
|
||||
|
||||
@register_to_config
|
||||
def __init__(
|
||||
self,
|
||||
do_resize: bool = True,
|
||||
vae_scale_factor: int = 8,
|
||||
vae_latent_channels: int = 16,
|
||||
spatial_patch_size: Tuple[int, int] = (2, 2),
|
||||
resample: str = "lanczos",
|
||||
reducing_gap: int = None,
|
||||
do_normalize: bool = True,
|
||||
do_binarize: bool = False,
|
||||
do_convert_rgb: bool = False,
|
||||
do_convert_grayscale: bool = False,
|
||||
fill_color: Optional[Union[str, float, Tuple[float, ...]]] = 0,
|
||||
):
|
||||
super().__init__()
|
||||
if do_convert_rgb and do_convert_grayscale:
|
||||
raise ValueError(
|
||||
"`do_convert_rgb` and `do_convert_grayscale` can not both be set to `True`,"
|
||||
" if you intended to convert the image into RGB format, please set `do_convert_grayscale = False`.",
|
||||
" if you intended to convert the image into grayscale format, please set `do_convert_rgb = False`",
|
||||
)
|
||||
|
||||
def _resize_and_fill(
|
||||
self,
|
||||
image: PIL.Image.Image,
|
||||
width: int,
|
||||
height: int,
|
||||
) -> PIL.Image.Image:
|
||||
r"""
|
||||
Resize the image to fit within the specified width and height, maintaining the aspect ratio, and then center
|
||||
the image within the dimensions, filling empty with data from image.
|
||||
|
||||
Args:
|
||||
image (`PIL.Image.Image`):
|
||||
The image to resize and fill.
|
||||
width (`int`):
|
||||
The width to resize the image to.
|
||||
height (`int`):
|
||||
The height to resize the image to.
|
||||
|
||||
Returns:
|
||||
`PIL.Image.Image`:
|
||||
The resized and filled image.
|
||||
"""
|
||||
|
||||
ratio = width / height
|
||||
src_ratio = image.width / image.height
|
||||
fill_with_image_data = self.config.fill_color is None
|
||||
fill_color = self.config.fill_color or 0
|
||||
|
||||
src_w = width if ratio < src_ratio else image.width * height // image.height
|
||||
src_h = height if ratio >= src_ratio else image.height * width // image.width
|
||||
|
||||
resized = image.resize((src_w, src_h), resample=PIL_INTERPOLATION[self.config.resample])
|
||||
res = PIL.Image.new("RGB", (width, height), color=fill_color)
|
||||
res.paste(resized, box=(width // 2 - src_w // 2, height // 2 - src_h // 2))
|
||||
|
||||
if fill_with_image_data:
|
||||
if ratio < src_ratio:
|
||||
fill_height = height // 2 - src_h // 2
|
||||
if fill_height > 0:
|
||||
res.paste(resized.resize((width, fill_height), box=(0, 0, width, 0)), box=(0, 0))
|
||||
res.paste(
|
||||
resized.resize((width, fill_height), box=(0, resized.height, width, resized.height)),
|
||||
box=(0, fill_height + src_h),
|
||||
)
|
||||
elif ratio > src_ratio:
|
||||
fill_width = width // 2 - src_w // 2
|
||||
if fill_width > 0:
|
||||
res.paste(resized.resize((fill_width, height), box=(0, 0, 0, height)), box=(0, 0))
|
||||
res.paste(
|
||||
resized.resize((fill_width, height), box=(resized.width, 0, resized.width, height)),
|
||||
box=(fill_width + src_w, 0),
|
||||
)
|
||||
|
||||
return res
|
||||
|
||||
def get_default_height_width(
|
||||
self,
|
||||
image: Union[PIL.Image.Image, np.ndarray, torch.Tensor],
|
||||
height: Optional[int] = None,
|
||||
width: Optional[int] = None,
|
||||
) -> Tuple[int, int]:
|
||||
r"""
|
||||
Returns the height and width of the image, downscaled to the next integer multiple of `vae_scale_factor`.
|
||||
|
||||
Args:
|
||||
image (`Union[PIL.Image.Image, np.ndarray, torch.Tensor]`):
|
||||
The image input, which can be a PIL image, NumPy array, or PyTorch tensor. If it is a NumPy array, it
|
||||
should have shape `[batch, height, width]` or `[batch, height, width, channels]`. If it is a PyTorch
|
||||
tensor, it should have shape `[batch, channels, height, width]`.
|
||||
height (`Optional[int]`, *optional*, defaults to `None`):
|
||||
The height of the preprocessed image. If `None`, the height of the `image` input will be used.
|
||||
width (`Optional[int]`, *optional*, defaults to `None`):
|
||||
The width of the preprocessed image. If `None`, the width of the `image` input will be used.
|
||||
|
||||
Returns:
|
||||
`Tuple[int, int]`:
|
||||
A tuple containing the height and width, both resized to the nearest integer multiple of
|
||||
`vae_scale_factor * spatial_patch_size`.
|
||||
"""
|
||||
|
||||
if height is None:
|
||||
if isinstance(image, PIL.Image.Image):
|
||||
height = image.height
|
||||
elif isinstance(image, torch.Tensor):
|
||||
height = image.shape[2]
|
||||
else:
|
||||
height = image.shape[1]
|
||||
|
||||
if width is None:
|
||||
if isinstance(image, PIL.Image.Image):
|
||||
width = image.width
|
||||
elif isinstance(image, torch.Tensor):
|
||||
width = image.shape[3]
|
||||
else:
|
||||
width = image.shape[2]
|
||||
|
||||
max_area = width * height
|
||||
aspect_ratio = height / width
|
||||
mod_value_h = self.config.vae_scale_factor * self.config.spatial_patch_size[0]
|
||||
mod_value_w = self.config.vae_scale_factor * self.config.spatial_patch_size[1]
|
||||
|
||||
# Try to preserve the aspect ratio
|
||||
height = round(np.sqrt(max_area * aspect_ratio)) // mod_value_h * mod_value_h
|
||||
width = round(np.sqrt(max_area / aspect_ratio)) // mod_value_w * mod_value_w
|
||||
|
||||
return height, width
|
||||
File diff suppressed because it is too large
Load Diff
@@ -758,11 +758,11 @@ class WanVACEPipeline(DiffusionPipeline, WanLoraLoaderMixin):
|
||||
The number of denoising steps. More denoising steps usually lead to a higher quality image at the
|
||||
expense of slower inference.
|
||||
guidance_scale (`float`, defaults to `5.0`):
|
||||
Guidance scale as defined in [Classifier-Free Diffusion
|
||||
Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` of equation 2.
|
||||
of [Imagen Paper](https://huggingface.co/papers/2205.11487). Guidance scale is enabled by setting
|
||||
`guidance_scale > 1`. Higher guidance scale encourages to generate images that are closely linked to
|
||||
the text `prompt`, usually at the expense of lower image quality.
|
||||
Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
|
||||
`guidance_scale` is defined as `w` of equation 2. of [Imagen
|
||||
Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
|
||||
1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
|
||||
usually at the expense of lower image quality.
|
||||
guidance_scale_2 (`float`, *optional*, defaults to `None`):
|
||||
Guidance scale for the low-noise stage transformer (`transformer_2`). If `None` and the pipeline's
|
||||
`boundary_ratio` is not None, uses the same value as `guidance_scale`. Only used when `transformer_2`
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
|
||||
@@ -9,48 +9,13 @@ from ..utils import BaseOutput
|
||||
from .scheduling_utils import SchedulerMixin
|
||||
|
||||
|
||||
def gumbel_noise(t: torch.Tensor, generator: Optional[torch.Generator] = None) -> torch.Tensor:
|
||||
"""
|
||||
Generate Gumbel noise for sampling.
|
||||
|
||||
Args:
|
||||
t (`torch.Tensor`):
|
||||
Input tensor to match the shape and dtype of the output noise.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator for reproducible sampling.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
Gumbel-distributed noise with the same shape, dtype, and device as the input tensor.
|
||||
"""
|
||||
def gumbel_noise(t, generator=None):
|
||||
device = generator.device if generator is not None else t.device
|
||||
noise = torch.zeros_like(t, device=device).uniform_(0, 1, generator=generator).to(t.device)
|
||||
return -torch.log((-torch.log(noise.clamp(1e-20))).clamp(1e-20))
|
||||
|
||||
|
||||
def mask_by_random_topk(
|
||||
mask_len: torch.Tensor,
|
||||
probs: torch.Tensor,
|
||||
temperature: float = 1.0,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Mask tokens by selecting the top-k lowest confidence scores with temperature-based randomness.
|
||||
|
||||
Args:
|
||||
mask_len (`torch.Tensor`):
|
||||
Number of tokens to mask per sample in the batch.
|
||||
probs (`torch.Tensor`):
|
||||
Probability scores for each token.
|
||||
temperature (`float`, *optional*, defaults to 1.0):
|
||||
Temperature parameter for controlling randomness in the masking process.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator for reproducible sampling.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
Boolean mask indicating which tokens should be masked.
|
||||
"""
|
||||
def mask_by_random_topk(mask_len, probs, temperature=1.0, generator=None):
|
||||
confidence = torch.log(probs.clamp(1e-20)) + temperature * gumbel_noise(probs, generator=generator)
|
||||
sorted_confidence = torch.sort(confidence, dim=-1).values
|
||||
cut_off = torch.gather(sorted_confidence, 1, mask_len.long())
|
||||
@@ -64,46 +29,28 @@ class AmusedSchedulerOutput(BaseOutput):
|
||||
Output class for the scheduler's `step` function output.
|
||||
|
||||
Args:
|
||||
prev_sample (`torch.LongTensor` of shape `(batch_size, height, width)` or `(batch_size, sequence_length)`):
|
||||
Computed sample `(x_{t-1})` of previous timestep with token IDs. `prev_sample` should be used as next model
|
||||
input in the denoising loop.
|
||||
pred_original_sample (`torch.LongTensor` of shape `(batch_size, height, width)` or `(batch_size, sequence_length)`, *optional*):
|
||||
The predicted fully denoised sample `(x_{0})` with token IDs based on the model output from the current
|
||||
timestep. `pred_original_sample` can be used to preview progress or for guidance.
|
||||
prev_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
|
||||
Computed sample `(x_{t-1})` of previous timestep. `prev_sample` should be used as next model input in the
|
||||
denoising loop.
|
||||
pred_original_sample (`torch.Tensor` of shape `(batch_size, num_channels, height, width)` for images):
|
||||
The predicted denoised sample `(x_{0})` based on the model output from the current timestep.
|
||||
`pred_original_sample` can be used to preview progress or for guidance.
|
||||
"""
|
||||
|
||||
prev_sample: torch.Tensor
|
||||
pred_original_sample: Optional[torch.Tensor] = None
|
||||
pred_original_sample: torch.Tensor = None
|
||||
|
||||
|
||||
class AmusedScheduler(SchedulerMixin, ConfigMixin):
|
||||
"""
|
||||
A scheduler for masked token generation as used in [`AmusedPipeline`].
|
||||
|
||||
This scheduler iteratively unmasks tokens based on their confidence scores, following either a cosine or linear
|
||||
schedule. Unlike traditional diffusion schedulers that work with continuous pixel values, this scheduler operates
|
||||
on discrete token IDs, making it suitable for autoregressive and non-autoregressive masked token generation models.
|
||||
|
||||
This scheduler inherits from [`SchedulerMixin`] and [`ConfigMixin`]. Check the superclass documentation for the
|
||||
generic methods the library implements for all schedulers such as loading and saving.
|
||||
|
||||
Args:
|
||||
mask_token_id (`int`):
|
||||
The token ID used to represent masked tokens in the sequence.
|
||||
masking_schedule (`Literal["cosine", "linear"]`, *optional*, defaults to `"cosine"`):
|
||||
The schedule type for determining the mask ratio at each timestep. Can be either `"cosine"` or `"linear"`.
|
||||
"""
|
||||
|
||||
order = 1
|
||||
|
||||
temperatures: Optional[torch.Tensor]
|
||||
timesteps: Optional[torch.Tensor]
|
||||
temperatures: torch.Tensor
|
||||
|
||||
@register_to_config
|
||||
def __init__(
|
||||
self,
|
||||
mask_token_id: int,
|
||||
masking_schedule: Literal["cosine", "linear"] = "cosine",
|
||||
masking_schedule: str = "cosine",
|
||||
):
|
||||
self.temperatures = None
|
||||
self.timesteps = None
|
||||
@@ -111,23 +58,9 @@ class AmusedScheduler(SchedulerMixin, ConfigMixin):
|
||||
def set_timesteps(
|
||||
self,
|
||||
num_inference_steps: int,
|
||||
temperature: Union[float, Tuple[float, float], List[float]] = (2, 0),
|
||||
device: Optional[Union[str, torch.device]] = None,
|
||||
) -> None:
|
||||
"""
|
||||
Set the discrete timesteps used for the diffusion chain (to be run before inference).
|
||||
|
||||
Args:
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model.
|
||||
temperature (`Union[float, Tuple[float, float], List[float]]`, *optional*, defaults to `(2, 0)`):
|
||||
Temperature parameter(s) for controlling the randomness of sampling. If a tuple or list is provided,
|
||||
temperatures will be linearly interpolated between the first and second values across all timesteps. If
|
||||
a single value is provided, temperatures will be linearly interpolated from that value to 0.01.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps and temperatures should be moved to. If `None`, the timesteps are not
|
||||
moved.
|
||||
"""
|
||||
temperature: Union[int, Tuple[int, int], List[int]] = (2, 0),
|
||||
device: Union[str, torch.device] = None,
|
||||
):
|
||||
self.timesteps = torch.arange(num_inference_steps, device=device).flip(0)
|
||||
|
||||
if isinstance(temperature, (tuple, list)):
|
||||
@@ -138,38 +71,12 @@ class AmusedScheduler(SchedulerMixin, ConfigMixin):
|
||||
def step(
|
||||
self,
|
||||
model_output: torch.Tensor,
|
||||
timestep: int,
|
||||
timestep: torch.long,
|
||||
sample: torch.LongTensor,
|
||||
starting_mask_ratio: float = 1.0,
|
||||
starting_mask_ratio: int = 1,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[AmusedSchedulerOutput, Tuple[torch.Tensor, torch.Tensor]]:
|
||||
"""
|
||||
Predict the sample at the previous timestep by masking tokens based on confidence scores.
|
||||
|
||||
Args:
|
||||
model_output (`torch.Tensor`):
|
||||
The direct output from the learned diffusion model. Typically of shape `(batch_size, num_tokens,
|
||||
codebook_size)` or `(batch_size, codebook_size, height, width)` for 2D inputs.
|
||||
timestep (`int`):
|
||||
The current discrete timestep in the diffusion chain.
|
||||
sample (`torch.LongTensor`):
|
||||
A current instance of a sample created by the diffusion process. Contains token IDs, with masked
|
||||
positions indicated by `mask_token_id`.
|
||||
starting_mask_ratio (`float`, *optional*, defaults to 1.0):
|
||||
A multiplier applied to the mask ratio schedule. Values less than 1.0 will result in fewer tokens being
|
||||
masked at each step.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator for reproducible sampling.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether to return an [`~schedulers.scheduling_amused.AmusedSchedulerOutput`] or a plain tuple.
|
||||
|
||||
Returns:
|
||||
[`~schedulers.scheduling_amused.AmusedSchedulerOutput`] or `tuple`:
|
||||
If `return_dict` is `True`, [`~schedulers.scheduling_amused.AmusedSchedulerOutput`] is returned,
|
||||
otherwise a tuple is returned where the first element is the sample tensor (`prev_sample`) and the
|
||||
second element is the predicted original sample tensor (`pred_original_sample`).
|
||||
"""
|
||||
) -> Union[AmusedSchedulerOutput, Tuple]:
|
||||
two_dim_input = sample.ndim == 3 and model_output.ndim == 4
|
||||
|
||||
if two_dim_input:
|
||||
@@ -230,27 +137,7 @@ class AmusedScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
return AmusedSchedulerOutput(prev_sample, pred_original_sample)
|
||||
|
||||
def add_noise(
|
||||
self,
|
||||
sample: torch.LongTensor,
|
||||
timesteps: int,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
) -> torch.LongTensor:
|
||||
"""
|
||||
Add noise to a sample by randomly masking tokens according to the masking schedule.
|
||||
|
||||
Args:
|
||||
sample (`torch.LongTensor`):
|
||||
The input sample containing token IDs to be partially masked.
|
||||
timesteps (`int`):
|
||||
The timestep that determines how much masking to apply. Higher timesteps result in more masking.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator for reproducible masking.
|
||||
|
||||
Returns:
|
||||
`torch.LongTensor`:
|
||||
The sample with some tokens replaced by `mask_token_id` according to the masking schedule.
|
||||
"""
|
||||
def add_noise(self, sample, timesteps, generator=None):
|
||||
step_idx = (self.timesteps == timesteps).nonzero()
|
||||
ratio = (step_idx + 1) / len(self.timesteps)
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, Optional, Tuple, Union
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
|
||||
@@ -12,10 +12,10 @@ from .scheduling_utils import SchedulerMixin
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -23,17 +23,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
|
||||
@@ -121,7 +121,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -287,23 +287,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
|
||||
return c_skip, c_out
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -318,14 +302,7 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
def _init_step_index(self, timestep):
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -433,21 +410,6 @@ class CMStochasticIterativeScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -137,7 +137,7 @@ class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -266,19 +266,6 @@ class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -550,21 +537,6 @@ class CosineDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -49,10 +49,10 @@ class DDIMSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -60,17 +60,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -93,17 +92,17 @@ def betas_for_alpha_bar(
|
||||
return torch.tensor(betas, dtype=torch.float32)
|
||||
|
||||
|
||||
def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
|
||||
def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
The betas that the scheduler is being initialized with.
|
||||
the betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -144,9 +143,9 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
The starting `beta` value of inference.
|
||||
beta_end (`float`, defaults to 0.02):
|
||||
The final `beta` value.
|
||||
beta_schedule (`Literal["linear", "scaled_linear", "squaredcos_cap_v2"]`, defaults to `"linear"`):
|
||||
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Must be one
|
||||
of `"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`.
|
||||
beta_schedule (`str`, defaults to `"linear"`):
|
||||
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
|
||||
`linear`, `scaled_linear`, or `squaredcos_cap_v2`.
|
||||
trained_betas (`np.ndarray`, *optional*):
|
||||
Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
|
||||
clip_sample (`bool`, defaults to `True`):
|
||||
@@ -159,10 +158,10 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
otherwise it uses the alpha value at step 0.
|
||||
steps_offset (`int`, defaults to 0):
|
||||
An offset added to the inference steps, as required by some model families.
|
||||
prediction_type (`Literal["epsilon", "sample", "v_prediction"]`, defaults to `"epsilon"`):
|
||||
Prediction type of the scheduler function. Must be one of `"epsilon"` (predicts the noise of the diffusion
|
||||
process), `"sample"` (directly predicts the noisy sample), or `"v_prediction"` (see section 2.4 of [Imagen
|
||||
Video](https://huggingface.co/papers/2210.02303) paper).
|
||||
prediction_type (`str`, defaults to `epsilon`, *optional*):
|
||||
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
|
||||
`sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
|
||||
Video](https://imagen.research.google/video/paper.pdf) paper).
|
||||
thresholding (`bool`, defaults to `False`):
|
||||
Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
|
||||
as Stable Diffusion.
|
||||
@@ -170,10 +169,9 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
|
||||
sample_max_value (`float`, defaults to 1.0):
|
||||
The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
|
||||
timestep_spacing (`Literal["leading", "trailing", "linspace"]`, defaults to `"leading"`):
|
||||
The way the timesteps should be scaled. Must be one of `"leading"`, `"trailing"`, or `"linspace"`. Refer to
|
||||
Table 2 of the [Common Diffusion Noise Schedules and Sample Steps are
|
||||
Flawed](https://huggingface.co/papers/2305.08891) for more information.
|
||||
timestep_spacing (`str`, defaults to `"leading"`):
|
||||
The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
|
||||
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
|
||||
rescale_betas_zero_snr (`bool`, defaults to `False`):
|
||||
Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
|
||||
dark samples instead of limiting it to samples with medium brightness. Loosely related to
|
||||
@@ -189,17 +187,17 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
num_train_timesteps: int = 1000,
|
||||
beta_start: float = 0.0001,
|
||||
beta_end: float = 0.02,
|
||||
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
|
||||
beta_schedule: str = "linear",
|
||||
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
|
||||
clip_sample: bool = True,
|
||||
set_alpha_to_one: bool = True,
|
||||
steps_offset: int = 0,
|
||||
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
|
||||
prediction_type: str = "epsilon",
|
||||
thresholding: bool = False,
|
||||
dynamic_thresholding_ratio: float = 0.995,
|
||||
clip_sample_range: float = 1.0,
|
||||
sample_max_value: float = 1.0,
|
||||
timestep_spacing: Literal["leading", "trailing", "linspace"] = "leading",
|
||||
timestep_spacing: str = "leading",
|
||||
rescale_betas_zero_snr: bool = False,
|
||||
):
|
||||
if trained_betas is not None:
|
||||
@@ -252,25 +250,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
"""
|
||||
return sample
|
||||
|
||||
def _get_variance(self, timestep: int, prev_timestep: int) -> torch.Tensor:
|
||||
"""
|
||||
Computes the variance of the noise added at a given diffusion step.
|
||||
|
||||
For a given `timestep` and its previous step, this method calculates the variance as defined in DDIM/DDPM
|
||||
literature:
|
||||
var_t = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
|
||||
where alpha_prod and beta_prod are cumulative products of alphas and betas, respectively.
|
||||
|
||||
Args:
|
||||
timestep (`int`):
|
||||
The current timestep in the diffusion process.
|
||||
prev_timestep (`int`):
|
||||
The previous timestep in the diffusion process. If negative, uses `final_alpha_cumprod`.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The variance for the current timestep.
|
||||
"""
|
||||
def _get_variance(self, timestep, prev_timestep):
|
||||
alpha_prod_t = self.alphas_cumprod[timestep]
|
||||
alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
|
||||
beta_prod_t = 1 - alpha_prod_t
|
||||
@@ -283,8 +263,6 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -292,14 +270,6 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -324,18 +294,13 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
return sample
|
||||
|
||||
def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None) -> None:
|
||||
def set_timesteps(self, num_inference_steps: int, device: Union[str, torch.device] = None):
|
||||
"""
|
||||
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
|
||||
|
||||
Args:
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model.
|
||||
device (`Union[str, torch.device]`, *optional*):
|
||||
The device to use for the timesteps.
|
||||
|
||||
Raises:
|
||||
ValueError: If `num_inference_steps` is larger than `self.config.num_train_timesteps`.
|
||||
"""
|
||||
|
||||
if num_inference_steps > self.config.num_train_timesteps:
|
||||
@@ -381,7 +346,7 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
sample: torch.Tensor,
|
||||
eta: float = 0.0,
|
||||
use_clipped_model_output: bool = False,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
generator=None,
|
||||
variance_noise: Optional[torch.Tensor] = None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[DDIMSchedulerOutput, Tuple]:
|
||||
@@ -392,21 +357,20 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
Args:
|
||||
model_output (`torch.Tensor`):
|
||||
The direct output from learned diffusion model.
|
||||
timestep (`int`):
|
||||
timestep (`float`):
|
||||
The current discrete timestep in the diffusion chain.
|
||||
sample (`torch.Tensor`):
|
||||
A current instance of a sample created by the diffusion process.
|
||||
eta (`float`, *optional*, defaults to 0.0):
|
||||
The weight of noise for added noise in diffusion step. A value of 0 corresponds to DDIM (deterministic)
|
||||
and 1 corresponds to DDPM (fully stochastic).
|
||||
use_clipped_model_output (`bool`, *optional*, defaults to `False`):
|
||||
eta (`float`):
|
||||
The weight of noise for added noise in diffusion step.
|
||||
use_clipped_model_output (`bool`, defaults to `False`):
|
||||
If `True`, computes "corrected" `model_output` from the clipped predicted original sample. Necessary
|
||||
because predicted original sample is clipped to [-1, 1] when `self.config.clip_sample` is `True`. If no
|
||||
clipping has happened, "corrected" `model_output` would coincide with the one provided as input and
|
||||
`use_clipped_model_output` has no effect.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator for reproducible sampling.
|
||||
variance_noise (`torch.Tensor`, *optional*):
|
||||
A random number generator.
|
||||
variance_noise (`torch.Tensor`):
|
||||
Alternative to generating noise with `generator` by directly providing the noise for the variance
|
||||
itself. Useful for methods such as [`CycleDiffusion`].
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
@@ -513,22 +477,6 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -551,21 +499,6 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
@@ -584,5 +517,5 @@ class DDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
|
||||
return velocity
|
||||
|
||||
def __len__(self) -> int:
|
||||
def __len__(self):
|
||||
return self.config.num_train_timesteps
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -49,10 +49,10 @@ class DDIMSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -60,17 +60,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -409,22 +408,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -447,21 +430,6 @@ class CogVideoXDDIMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# and https://github.com/hojonathanho/diffusion
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -47,10 +47,10 @@ class DDIMSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -58,17 +58,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -96,13 +95,13 @@ def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
The betas that the scheduler is being initialized with.
|
||||
the betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -49,10 +49,10 @@ class DDIMParallelSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -60,17 +60,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -98,13 +97,13 @@ def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
The betas that the scheduler is being initialized with.
|
||||
the betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -195,17 +194,17 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
num_train_timesteps: int = 1000,
|
||||
beta_start: float = 0.0001,
|
||||
beta_end: float = 0.02,
|
||||
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
|
||||
beta_schedule: str = "linear",
|
||||
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
|
||||
clip_sample: bool = True,
|
||||
set_alpha_to_one: bool = True,
|
||||
steps_offset: int = 0,
|
||||
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
|
||||
prediction_type: str = "epsilon",
|
||||
thresholding: bool = False,
|
||||
dynamic_thresholding_ratio: float = 0.995,
|
||||
clip_sample_range: float = 1.0,
|
||||
sample_max_value: float = 1.0,
|
||||
timestep_spacing: Literal["leading", "trailing", "linspace"] = "leading",
|
||||
timestep_spacing: str = "leading",
|
||||
rescale_betas_zero_snr: bool = False,
|
||||
):
|
||||
if trained_betas is not None:
|
||||
@@ -286,8 +285,6 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -295,14 +292,6 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -335,11 +324,6 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
Args:
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model.
|
||||
device (`Union[str, torch.device]`, *optional*):
|
||||
The device to use for the timesteps.
|
||||
|
||||
Raises:
|
||||
ValueError: If `num_inference_steps` is larger than `self.config.num_train_timesteps`.
|
||||
"""
|
||||
|
||||
if num_inference_steps > self.config.num_train_timesteps:
|
||||
@@ -618,22 +602,6 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -656,21 +624,6 @@ class DDIMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -46,10 +46,10 @@ class DDPMSchedulerOutput(BaseOutput):
|
||||
|
||||
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -57,17 +57,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -91,17 +90,17 @@ def betas_for_alpha_bar(
|
||||
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
|
||||
def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
|
||||
def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR. Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
The betas that the scheduler is being initialized with.
|
||||
the betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -135,37 +134,39 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
methods the library implements for all schedulers such as loading and saving.
|
||||
|
||||
Args:
|
||||
num_train_timesteps (`int`, defaults to `1000`):
|
||||
num_train_timesteps (`int`, defaults to 1000):
|
||||
The number of diffusion steps to train the model.
|
||||
beta_start (`float`, defaults to `0.0001`):
|
||||
beta_start (`float`, defaults to 0.0001):
|
||||
The starting `beta` value of inference.
|
||||
beta_end (`float`, defaults to `0.02`):
|
||||
beta_end (`float`, defaults to 0.02):
|
||||
The final `beta` value.
|
||||
beta_schedule (`"linear"`, `"scaled_linear"`, `"squaredcos_cap_v2"`, or `"sigmoid"`, defaults to `"linear"`):
|
||||
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model.
|
||||
beta_schedule (`str`, defaults to `"linear"`):
|
||||
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
|
||||
`linear`, `scaled_linear`, `squaredcos_cap_v2`, or `sigmoid`.
|
||||
trained_betas (`np.ndarray`, *optional*):
|
||||
An array of betas to pass directly to the constructor without using `beta_start` and `beta_end`.
|
||||
variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, defaults to `"fixed_small"`):
|
||||
Clip the variance when adding noise to the denoised sample.
|
||||
variance_type (`str`, defaults to `"fixed_small"`):
|
||||
Clip the variance when adding noise to the denoised sample. Choose from `fixed_small`, `fixed_small_log`,
|
||||
`fixed_large`, `fixed_large_log`, `learned` or `learned_range`.
|
||||
clip_sample (`bool`, defaults to `True`):
|
||||
Clip the predicted sample for numerical stability.
|
||||
clip_sample_range (`float`, defaults to `1.0`):
|
||||
clip_sample_range (`float`, defaults to 1.0):
|
||||
The maximum magnitude for sample clipping. Valid only when `clip_sample=True`.
|
||||
prediction_type (`"epsilon"`, `"sample"`, or `"v_prediction"`, defaults to `"epsilon"`):
|
||||
prediction_type (`str`, defaults to `epsilon`, *optional*):
|
||||
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
|
||||
`sample` (directly predicts the noisy sample) or `v_prediction` (see section 2.4 of [Imagen
|
||||
Video](https://imagen.research.google/video/paper.pdf) paper).
|
||||
thresholding (`bool`, defaults to `False`):
|
||||
Whether to use the "dynamic thresholding" method. This is unsuitable for latent-space diffusion models such
|
||||
as Stable Diffusion.
|
||||
dynamic_thresholding_ratio (`float`, defaults to `0.995`):
|
||||
dynamic_thresholding_ratio (`float`, defaults to 0.995):
|
||||
The ratio for the dynamic thresholding method. Valid only when `thresholding=True`.
|
||||
sample_max_value (`float`, defaults to `1.0`):
|
||||
sample_max_value (`float`, defaults to 1.0):
|
||||
The threshold value for dynamic thresholding. Valid only when `thresholding=True`.
|
||||
timestep_spacing (`"linspace"`, `"leading"`, or `"trailing"`, defaults to `"leading"`):
|
||||
timestep_spacing (`str`, defaults to `"leading"`):
|
||||
The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
|
||||
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
|
||||
steps_offset (`int`, defaults to `0`):
|
||||
steps_offset (`int`, defaults to 0):
|
||||
An offset added to the inference steps, as required by some model families.
|
||||
rescale_betas_zero_snr (`bool`, defaults to `False`):
|
||||
Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
|
||||
@@ -182,18 +183,16 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
num_train_timesteps: int = 1000,
|
||||
beta_start: float = 0.0001,
|
||||
beta_end: float = 0.02,
|
||||
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2", "sigmoid"] = "linear",
|
||||
beta_schedule: str = "linear",
|
||||
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
|
||||
variance_type: Literal[
|
||||
"fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"
|
||||
] = "fixed_small",
|
||||
variance_type: str = "fixed_small",
|
||||
clip_sample: bool = True,
|
||||
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
|
||||
prediction_type: str = "epsilon",
|
||||
thresholding: bool = False,
|
||||
dynamic_thresholding_ratio: float = 0.995,
|
||||
clip_sample_range: float = 1.0,
|
||||
sample_max_value: float = 1.0,
|
||||
timestep_spacing: Literal["linspace", "leading", "trailing"] = "leading",
|
||||
timestep_spacing: str = "leading",
|
||||
steps_offset: int = 0,
|
||||
rescale_betas_zero_snr: bool = False,
|
||||
):
|
||||
@@ -323,31 +322,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
self.timesteps = torch.from_numpy(timesteps).to(device)
|
||||
|
||||
def _get_variance(
|
||||
self,
|
||||
t: int,
|
||||
predicted_variance: Optional[torch.Tensor] = None,
|
||||
variance_type: Optional[
|
||||
Literal["fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"]
|
||||
] = None,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Compute the variance for a given timestep according to the specified variance type.
|
||||
|
||||
Args:
|
||||
t (`int`):
|
||||
The current timestep.
|
||||
predicted_variance (`torch.Tensor`, *optional*):
|
||||
The predicted variance from the model. Used only when `variance_type` is `"learned"` or
|
||||
`"learned_range"`.
|
||||
variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, *optional*):
|
||||
The type of variance to compute. If `None`, uses the variance type specified in the scheduler
|
||||
configuration.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed variance.
|
||||
"""
|
||||
def _get_variance(self, t, predicted_variance=None, variance_type=None):
|
||||
prev_t = self.previous_timestep(t)
|
||||
|
||||
alpha_prod_t = self.alphas_cumprod[t]
|
||||
@@ -389,8 +364,6 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -398,14 +371,6 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -435,7 +400,7 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
model_output: torch.Tensor,
|
||||
timestep: int,
|
||||
sample: torch.Tensor,
|
||||
generator: Optional[torch.Generator] = None,
|
||||
generator=None,
|
||||
return_dict: bool = True,
|
||||
) -> Union[DDPMSchedulerOutput, Tuple]:
|
||||
"""
|
||||
@@ -445,19 +410,20 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
Args:
|
||||
model_output (`torch.Tensor`):
|
||||
The direct output from learned diffusion model.
|
||||
timestep (`int`):
|
||||
timestep (`float`):
|
||||
The current discrete timestep in the diffusion chain.
|
||||
sample (`torch.Tensor`):
|
||||
A current instance of a sample created by the diffusion process.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator.
|
||||
return_dict (`bool`, defaults to `True`):
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to return a [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`.
|
||||
|
||||
Returns:
|
||||
[`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] or `tuple`:
|
||||
If return_dict is `True`, [`~schedulers.scheduling_ddpm.DDPMSchedulerOutput`] is returned, otherwise a
|
||||
tuple is returned where the first element is the sample tensor.
|
||||
|
||||
"""
|
||||
t = timestep
|
||||
|
||||
@@ -538,22 +504,6 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -575,21 +525,6 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
return noisy_samples
|
||||
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
@@ -608,21 +543,10 @@ class DDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
|
||||
return velocity
|
||||
|
||||
def __len__(self) -> int:
|
||||
def __len__(self):
|
||||
return self.config.num_train_timesteps
|
||||
|
||||
def previous_timestep(self, timestep: int) -> int:
|
||||
"""
|
||||
Compute the previous timestep in the diffusion chain.
|
||||
|
||||
Args:
|
||||
timestep (`int`):
|
||||
The current timestep.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The previous timestep.
|
||||
"""
|
||||
def previous_timestep(self, timestep):
|
||||
if self.custom_timesteps or self.num_inference_steps:
|
||||
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
|
||||
if index == self.timesteps.shape[0] - 1:
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -48,10 +48,10 @@ class DDPMParallelSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -59,17 +59,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -97,13 +96,13 @@ def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR. Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
The betas that the scheduler is being initialized with.
|
||||
the betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -192,18 +191,16 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
num_train_timesteps: int = 1000,
|
||||
beta_start: float = 0.0001,
|
||||
beta_end: float = 0.02,
|
||||
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2", "sigmoid"] = "linear",
|
||||
beta_schedule: str = "linear",
|
||||
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
|
||||
variance_type: Literal[
|
||||
"fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"
|
||||
] = "fixed_small",
|
||||
variance_type: str = "fixed_small",
|
||||
clip_sample: bool = True,
|
||||
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
|
||||
prediction_type: str = "epsilon",
|
||||
thresholding: bool = False,
|
||||
dynamic_thresholding_ratio: float = 0.995,
|
||||
clip_sample_range: float = 1.0,
|
||||
sample_max_value: float = 1.0,
|
||||
timestep_spacing: Literal["linspace", "leading", "trailing"] = "leading",
|
||||
timestep_spacing: str = "leading",
|
||||
steps_offset: int = 0,
|
||||
rescale_betas_zero_snr: bool = False,
|
||||
):
|
||||
@@ -336,31 +333,7 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
self.timesteps = torch.from_numpy(timesteps).to(device)
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._get_variance
|
||||
def _get_variance(
|
||||
self,
|
||||
t: int,
|
||||
predicted_variance: Optional[torch.Tensor] = None,
|
||||
variance_type: Optional[
|
||||
Literal["fixed_small", "fixed_small_log", "fixed_large", "fixed_large_log", "learned", "learned_range"]
|
||||
] = None,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Compute the variance for a given timestep according to the specified variance type.
|
||||
|
||||
Args:
|
||||
t (`int`):
|
||||
The current timestep.
|
||||
predicted_variance (`torch.Tensor`, *optional*):
|
||||
The predicted variance from the model. Used only when `variance_type` is `"learned"` or
|
||||
`"learned_range"`.
|
||||
variance_type (`"fixed_small"`, `"fixed_small_log"`, `"fixed_large"`, `"fixed_large_log"`, `"learned"`, or `"learned_range"`, *optional*):
|
||||
The type of variance to compute. If `None`, uses the variance type specified in the scheduler
|
||||
configuration.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed variance.
|
||||
"""
|
||||
def _get_variance(self, t, predicted_variance=None, variance_type=None):
|
||||
prev_t = self.previous_timestep(t)
|
||||
|
||||
alpha_prod_t = self.alphas_cumprod[t]
|
||||
@@ -403,8 +376,6 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -412,14 +383,6 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -630,22 +593,6 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -668,21 +615,6 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
@@ -706,17 +638,6 @@ class DDPMParallelScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
|
||||
def previous_timestep(self, timestep):
|
||||
"""
|
||||
Compute the previous timestep in the diffusion chain.
|
||||
|
||||
Args:
|
||||
timestep (`int`):
|
||||
The current timestep.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The previous timestep.
|
||||
"""
|
||||
if self.custom_timesteps or self.num_inference_steps:
|
||||
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
|
||||
if index == self.timesteps.shape[0] - 1:
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
|
||||
|
||||
import math
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -32,10 +32,10 @@ if is_scipy_available():
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -43,17 +43,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, defaults to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -230,7 +229,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -321,8 +320,6 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -330,14 +327,6 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -364,19 +353,6 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -412,20 +388,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -451,19 +414,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
"""Constructs an exponential noise schedule."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -487,24 +438,7 @@ class DEISMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -50,10 +50,10 @@ class DDIMSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -61,17 +61,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -446,22 +445,6 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -484,21 +467,6 @@ class CogVideoXDPMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
|
||||
|
||||
import math
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -32,10 +32,10 @@ if is_scipy_available():
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -43,17 +43,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -81,13 +80,13 @@ def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
The betas that the scheduler is being initialized with.
|
||||
the betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -324,7 +323,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -461,8 +460,6 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -470,14 +467,6 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -504,19 +493,6 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -551,20 +527,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -603,19 +566,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
"""Constructs an exponential noise schedule."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -639,24 +590,7 @@ class DPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
|
||||
|
||||
import math
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -32,10 +32,10 @@ if is_scipy_available():
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -43,17 +43,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -333,8 +332,6 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -342,14 +339,6 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -376,19 +365,6 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -424,20 +400,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -463,19 +426,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
"""Constructs an exponential noise schedule."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -499,24 +450,7 @@ class DPMSolverMultistepInverseScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -115,10 +115,10 @@ class BrownianTreeNoiseSampler:
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -126,17 +126,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -251,23 +250,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
|
||||
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -282,14 +265,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
def _init_step_index(self, timestep):
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -325,7 +301,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -453,19 +429,6 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -504,19 +467,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
"""Constructs an exponential noise schedule."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -540,24 +491,7 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -711,21 +645,6 @@ class DPMSolverSDEScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
# DISCLAIMER: This file is strongly influenced by https://github.com/LuChengTHU/dpm-solver
|
||||
|
||||
import math
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -34,10 +34,10 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -45,17 +45,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -295,7 +294,7 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -411,8 +410,6 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -420,14 +417,6 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -454,19 +443,6 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -502,20 +478,7 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -541,19 +504,7 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
"""Constructs an exponential noise schedule."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -577,24 +528,7 @@ class DPMSolverSinglestepScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
|
||||
@@ -169,7 +169,7 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -299,8 +299,6 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -308,14 +306,6 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -342,19 +332,6 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -695,21 +672,6 @@ class EDMDPMSolverMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -155,7 +155,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -284,23 +284,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
|
||||
return sigmas
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -315,14 +299,7 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
def _init_step_index(self, timestep):
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -436,21 +413,6 @@ class EDMEulerScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -49,10 +49,10 @@ class EulerAncestralDiscreteSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -60,17 +60,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -98,13 +97,13 @@ def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
The betas that the scheduler is being initialized with.
|
||||
the betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -246,7 +245,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -320,23 +319,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -351,14 +334,7 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
def _init_step_index(self, timestep):
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -475,21 +451,6 @@ class EulerAncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -52,10 +52,10 @@ class EulerDiscreteSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -63,17 +63,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -97,17 +96,17 @@ def betas_for_alpha_bar(
|
||||
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddim.rescale_zero_terminal_snr
|
||||
def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
|
||||
def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
The betas that the scheduler is being initialized with.
|
||||
the betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -147,17 +146,17 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
The starting `beta` value of inference.
|
||||
beta_end (`float`, defaults to 0.02):
|
||||
The final `beta` value.
|
||||
beta_schedule (`Literal["linear", "scaled_linear", "squaredcos_cap_v2"]`, defaults to `"linear"`):
|
||||
beta_schedule (`str`, defaults to `"linear"`):
|
||||
The beta schedule, a mapping from a beta range to a sequence of betas for stepping the model. Choose from
|
||||
`"linear"`, `"scaled_linear"`, or `"squaredcos_cap_v2"`.
|
||||
`linear` or `scaled_linear`.
|
||||
trained_betas (`np.ndarray`, *optional*):
|
||||
Pass an array of betas directly to the constructor to bypass `beta_start` and `beta_end`.
|
||||
prediction_type (`Literal["epsilon", "sample", "v_prediction"]`, defaults to `"epsilon"`, *optional*):
|
||||
Prediction type of the scheduler function; can be `"epsilon"` (predicts the noise of the diffusion
|
||||
process), `"sample"` (directly predicts the noisy sample`) or `"v_prediction"` (see section 2.4 of [Imagen
|
||||
prediction_type (`str`, defaults to `epsilon`, *optional*):
|
||||
Prediction type of the scheduler function; can be `epsilon` (predicts the noise of the diffusion process),
|
||||
`sample` (directly predicts the noisy sample`) or `v_prediction` (see section 2.4 of [Imagen
|
||||
Video](https://imagen.research.google/video/paper.pdf) paper).
|
||||
interpolation_type (`Literal["linear", "log_linear"]`, defaults to `"linear"`, *optional*):
|
||||
The interpolation type to compute intermediate sigmas for the scheduler denoising steps. Should be one of
|
||||
interpolation_type(`str`, defaults to `"linear"`, *optional*):
|
||||
The interpolation type to compute intermediate sigmas for the scheduler denoising steps. Should be on of
|
||||
`"linear"` or `"log_linear"`.
|
||||
use_karras_sigmas (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use Karras sigmas for step sizes in the noise schedule during the sampling process. If `True`,
|
||||
@@ -167,26 +166,18 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
use_beta_sigmas (`bool`, *optional*, defaults to `False`):
|
||||
Whether to use beta sigmas for step sizes in the noise schedule during the sampling process. Refer to [Beta
|
||||
Sampling is All You Need](https://huggingface.co/papers/2407.12173) for more information.
|
||||
sigma_min (`float`, *optional*):
|
||||
The minimum sigma value for the noise schedule. If not provided, defaults to the last sigma in the
|
||||
schedule.
|
||||
sigma_max (`float`, *optional*):
|
||||
The maximum sigma value for the noise schedule. If not provided, defaults to the first sigma in the
|
||||
schedule.
|
||||
timestep_spacing (`Literal["linspace", "leading", "trailing"]`, defaults to `"linspace"`):
|
||||
timestep_spacing (`str`, defaults to `"linspace"`):
|
||||
The way the timesteps should be scaled. Refer to Table 2 of the [Common Diffusion Noise Schedules and
|
||||
Sample Steps are Flawed](https://huggingface.co/papers/2305.08891) for more information.
|
||||
timestep_type (`Literal["discrete", "continuous"]`, defaults to `"discrete"`):
|
||||
The type of timesteps to use. Can be `"discrete"` or `"continuous"`.
|
||||
steps_offset (`int`, defaults to 0):
|
||||
An offset added to the inference steps, as required by some model families.
|
||||
rescale_betas_zero_snr (`bool`, defaults to `False`):
|
||||
Whether to rescale the betas to have zero terminal SNR. This enables the model to generate very bright and
|
||||
dark samples instead of limiting it to samples with medium brightness. Loosely related to
|
||||
[`--offset_noise`](https://github.com/huggingface/diffusers/blob/74fd735eb073eb1d774b1ab4154a0876eb82f055/examples/dreambooth/train_dreambooth.py#L506).
|
||||
final_sigmas_type (`Literal["zero", "sigma_min"]`, defaults to `"zero"`):
|
||||
final_sigmas_type (`str`, defaults to `"zero"`):
|
||||
The final `sigma` value for the noise schedule during the sampling process. If `"sigma_min"`, the final
|
||||
sigma is the same as the last sigma in the training schedule. If `"zero"`, the final sigma is set to 0.
|
||||
sigma is the same as the last sigma in the training schedule. If `zero`, the final sigma is set to 0.
|
||||
"""
|
||||
|
||||
_compatibles = [e.name for e in KarrasDiffusionSchedulers]
|
||||
@@ -198,20 +189,20 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
num_train_timesteps: int = 1000,
|
||||
beta_start: float = 0.0001,
|
||||
beta_end: float = 0.02,
|
||||
beta_schedule: Literal["linear", "scaled_linear", "squaredcos_cap_v2"] = "linear",
|
||||
beta_schedule: str = "linear",
|
||||
trained_betas: Optional[Union[np.ndarray, List[float]]] = None,
|
||||
prediction_type: Literal["epsilon", "sample", "v_prediction"] = "epsilon",
|
||||
interpolation_type: Literal["linear", "log_linear"] = "linear",
|
||||
prediction_type: str = "epsilon",
|
||||
interpolation_type: str = "linear",
|
||||
use_karras_sigmas: Optional[bool] = False,
|
||||
use_exponential_sigmas: Optional[bool] = False,
|
||||
use_beta_sigmas: Optional[bool] = False,
|
||||
sigma_min: Optional[float] = None,
|
||||
sigma_max: Optional[float] = None,
|
||||
timestep_spacing: Literal["linspace", "leading", "trailing"] = "linspace",
|
||||
timestep_type: Literal["discrete", "continuous"] = "discrete",
|
||||
timestep_spacing: str = "linspace",
|
||||
timestep_type: str = "discrete", # can be "discrete" or "continuous"
|
||||
steps_offset: int = 0,
|
||||
rescale_betas_zero_snr: bool = False,
|
||||
final_sigmas_type: Literal["zero", "sigma_min"] = "zero",
|
||||
final_sigmas_type: str = "zero", # can be "zero" or "sigma_min"
|
||||
):
|
||||
if self.config.use_beta_sigmas and not is_scipy_available():
|
||||
raise ImportError("Make sure to install scipy if you want to use beta sigmas.")
|
||||
@@ -268,15 +259,8 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
|
||||
|
||||
@property
|
||||
def init_noise_sigma(self) -> Union[float, torch.Tensor]:
|
||||
"""
|
||||
The standard deviation of the initial noise distribution.
|
||||
|
||||
Returns:
|
||||
`float` or `torch.Tensor`:
|
||||
The standard deviation of the initial noise distribution, computed based on the maximum sigma value and
|
||||
the timestep spacing configuration.
|
||||
"""
|
||||
def init_noise_sigma(self):
|
||||
# standard deviation of the initial noise distribution
|
||||
max_sigma = max(self.sigmas) if isinstance(self.sigmas, list) else self.sigmas.max()
|
||||
if self.config.timestep_spacing in ["linspace", "trailing"]:
|
||||
return max_sigma
|
||||
@@ -284,34 +268,26 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return (max_sigma**2 + 1) ** 0.5
|
||||
|
||||
@property
|
||||
def step_index(self) -> Optional[int]:
|
||||
def step_index(self):
|
||||
"""
|
||||
The index counter for current timestep. It will increase by 1 after each scheduler step.
|
||||
|
||||
Returns:
|
||||
`int` or `None`:
|
||||
The current step index, or `None` if not initialized.
|
||||
The index counter for current timestep. It will increase 1 after each scheduler step.
|
||||
"""
|
||||
return self._step_index
|
||||
|
||||
@property
|
||||
def begin_index(self) -> Optional[int]:
|
||||
def begin_index(self):
|
||||
"""
|
||||
The index for the first timestep. It should be set from pipeline with `set_begin_index` method.
|
||||
|
||||
Returns:
|
||||
`int` or `None`:
|
||||
The begin index for the scheduler, or `None` if not set.
|
||||
"""
|
||||
return self._begin_index
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_dpmsolver_multistep.DPMSolverMultistepScheduler.set_begin_index
|
||||
def set_begin_index(self, begin_index: int = 0) -> None:
|
||||
def set_begin_index(self, begin_index: int = 0):
|
||||
"""
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -323,13 +299,13 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample to be scaled.
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The input sample.
|
||||
timestep (`int`, *optional*):
|
||||
The current timestep in the diffusion chain.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
A scaled input sample, divided by `(sigma**2 + 1) ** 0.5`.
|
||||
A scaled input sample.
|
||||
"""
|
||||
if self.step_index is None:
|
||||
self._init_step_index(timestep)
|
||||
@@ -342,18 +318,17 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
def set_timesteps(
|
||||
self,
|
||||
num_inference_steps: Optional[int] = None,
|
||||
device: Optional[Union[str, torch.device]] = None,
|
||||
num_inference_steps: int = None,
|
||||
device: Union[str, torch.device] = None,
|
||||
timesteps: Optional[List[int]] = None,
|
||||
sigmas: Optional[List[float]] = None,
|
||||
) -> None:
|
||||
):
|
||||
"""
|
||||
Sets the discrete timesteps used for the diffusion chain (to be run before inference).
|
||||
|
||||
Args:
|
||||
num_inference_steps (`int`, *optional*):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model. If `None`,
|
||||
`timesteps` or `sigmas` must be provided.
|
||||
num_inference_steps (`int`):
|
||||
The number of diffusion steps used when generating samples with a pre-trained model.
|
||||
device (`str` or `torch.device`, *optional*):
|
||||
The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
|
||||
timesteps (`List[int]`, *optional*):
|
||||
@@ -361,9 +336,10 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
based on the `timestep_spacing` attribute. If `timesteps` is passed, `num_inference_steps` and `sigmas`
|
||||
must be `None`, and `timestep_spacing` attribute will be ignored.
|
||||
sigmas (`List[float]`, *optional*):
|
||||
Custom sigmas used to support arbitrary timesteps schedule. If `None`, timesteps and sigmas will be
|
||||
generated based on the relevant scheduler attributes. If `sigmas` is passed, `num_inference_steps` and
|
||||
`timesteps` must be `None`, and the timesteps will be generated based on the custom sigmas schedule.
|
||||
Custom sigmas used to support arbitrary timesteps schedule schedule. If `None`, timesteps and sigmas
|
||||
will be generated based on the relevant scheduler attributes. If `sigmas` is passed,
|
||||
`num_inference_steps` and `timesteps` must be `None`, and the timesteps will be generated based on the
|
||||
custom sigmas schedule.
|
||||
"""
|
||||
|
||||
if timesteps is not None and sigmas is not None:
|
||||
@@ -473,20 +449,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
self._begin_index = None
|
||||
self.sigmas = sigmas.to("cpu") # to avoid too much CPU/GPU communication
|
||||
|
||||
def _sigma_to_t(self, sigma: np.ndarray, log_sigmas: np.ndarray) -> np.ndarray:
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -510,21 +473,8 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return t
|
||||
|
||||
# Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L17
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -550,19 +500,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from https://github.com/crowsonkb/k-diffusion/blob/686dbad0f39640ea25c8a8c6a6e56bb40eacefa2/k_diffusion/sampling.py#L26
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
"""Constructs an exponential noise schedule."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -585,24 +523,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -630,23 +551,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
)
|
||||
return sigmas
|
||||
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -660,14 +565,7 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
return indices[pos].item()
|
||||
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
def _init_step_index(self, timestep):
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -693,33 +591,26 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
Args:
|
||||
model_output (`torch.Tensor`):
|
||||
The direct output from the learned diffusion model.
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The direct output from learned diffusion model.
|
||||
timestep (`float`):
|
||||
The current discrete timestep in the diffusion chain.
|
||||
sample (`torch.Tensor`):
|
||||
A current instance of a sample created by the diffusion process.
|
||||
s_churn (`float`, *optional*, defaults to `0.0`):
|
||||
Stochasticity parameter that controls the amount of noise added during sampling. Higher values increase
|
||||
randomness.
|
||||
s_tmin (`float`, *optional*, defaults to `0.0`):
|
||||
Minimum timestep threshold for applying stochasticity. Only timesteps above this value will have noise
|
||||
added.
|
||||
s_tmax (`float`, *optional*, defaults to `inf`):
|
||||
Maximum timestep threshold for applying stochasticity. Only timesteps below this value will have noise
|
||||
added.
|
||||
s_noise (`float`, *optional*, defaults to `1.0`):
|
||||
s_churn (`float`):
|
||||
s_tmin (`float`):
|
||||
s_tmax (`float`):
|
||||
s_noise (`float`, defaults to 1.0):
|
||||
Scaling factor for noise added to the sample.
|
||||
generator (`torch.Generator`, *optional*):
|
||||
A random number generator for reproducible sampling.
|
||||
return_dict (`bool`, *optional*, defaults to `True`):
|
||||
A random number generator.
|
||||
return_dict (`bool`):
|
||||
Whether or not to return a [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or
|
||||
tuple.
|
||||
|
||||
Returns:
|
||||
[`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] or `tuple`:
|
||||
If `return_dict` is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
|
||||
returned, otherwise a tuple is returned where the first element is the sample tensor and the second
|
||||
element is the predicted original sample.
|
||||
If return_dict is `True`, [`~schedulers.scheduling_euler_discrete.EulerDiscreteSchedulerOutput`] is
|
||||
returned, otherwise a tuple is returned where the first element is the sample tensor.
|
||||
"""
|
||||
|
||||
if isinstance(timestep, (int, torch.IntTensor, torch.LongTensor)):
|
||||
@@ -798,21 +689,6 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
@@ -841,24 +717,6 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return noisy_samples
|
||||
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction for the given sample and noise at the specified timesteps.
|
||||
|
||||
This method implements the velocity prediction used in v-prediction models, which predicts a linear combination
|
||||
of the sample and noise.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample for which to compute the velocity.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor corresponding to the sample.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to compute the velocity.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The velocity prediction computed as `sqrt(alpha_prod) * noise - sqrt(1 - alpha_prod) * sample`.
|
||||
"""
|
||||
if (
|
||||
isinstance(timesteps, int)
|
||||
or isinstance(timesteps, torch.IntTensor)
|
||||
@@ -895,5 +753,5 @@ class EulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
velocity = sqrt_alpha_prod * noise - sqrt_one_minus_alpha_prod * sample
|
||||
return velocity
|
||||
|
||||
def __len__(self) -> int:
|
||||
def __len__(self):
|
||||
return self.config.num_train_timesteps
|
||||
|
||||
@@ -160,7 +160,7 @@ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -473,20 +473,7 @@ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -512,19 +499,7 @@ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
"""Constructs an exponential noise schedule."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -548,24 +523,7 @@ class FlowMatchEulerDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
|
||||
@@ -102,7 +102,7 @@ class FlowMatchHeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
|
||||
@@ -168,7 +168,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -473,20 +473,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -512,19 +499,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
"""Constructs an exponential noise schedule."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -548,24 +523,7 @@ class FlowMatchLCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -49,10 +49,10 @@ class HeunDiscreteSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -60,17 +60,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -188,23 +187,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
self.sigmas = self.sigmas.to("cpu") # to avoid too much CPU/GPU communication
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -246,7 +229,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -371,19 +354,6 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -408,20 +378,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -447,19 +404,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
"""Constructs an exponential noise schedule."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -483,24 +428,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -533,14 +461,7 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return self.dt is None
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
def _init_step_index(self, timestep):
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -658,21 +579,6 @@ class HeunDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -78,7 +78,7 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -112,23 +112,7 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
|
||||
self._begin_index = None
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -143,14 +127,7 @@ class IPNDMScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
def _init_step_index(self, timestep):
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -50,10 +50,10 @@ class KDPM2AncestralDiscreteSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -61,17 +61,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -207,7 +206,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -343,19 +342,6 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -380,20 +366,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -419,19 +392,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
"""Constructs an exponential noise schedule."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -455,24 +416,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -505,23 +449,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return self.sample is None
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -536,14 +464,7 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
def _init_step_index(self, timestep):
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -665,21 +586,6 @@ class KDPM2AncestralDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -49,10 +49,10 @@ class KDPM2DiscreteSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -60,17 +60,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -207,7 +206,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -331,23 +330,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return self.sample is None
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -362,14 +345,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
def _init_step_index(self, timestep):
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -379,19 +355,6 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -416,20 +379,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -455,19 +405,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
"""Constructs an exponential noise schedule."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -491,24 +429,7 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -637,21 +558,6 @@ class KDPM2DiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -51,10 +51,10 @@ class LCMSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -62,17 +62,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -100,13 +99,13 @@ def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
The betas that the scheduler is being initialized with.
|
||||
the betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -252,23 +251,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
self._begin_index = None
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -283,14 +266,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
def _init_step_index(self, timestep):
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -315,7 +291,7 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -339,8 +315,6 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -348,14 +322,6 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -631,22 +597,6 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -669,21 +619,6 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
@@ -707,17 +642,6 @@ class LCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
|
||||
def previous_timestep(self, timestep):
|
||||
"""
|
||||
Compute the previous timestep in the diffusion chain.
|
||||
|
||||
Args:
|
||||
timestep (`int`):
|
||||
The current timestep.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The previous timestep.
|
||||
"""
|
||||
if self.custom_timesteps or self.num_inference_steps:
|
||||
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
|
||||
if index == self.timesteps.shape[0] - 1:
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
import math
|
||||
import warnings
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import scipy.stats
|
||||
@@ -47,10 +47,10 @@ class LMSDiscreteSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -58,17 +58,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -210,7 +209,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -320,23 +319,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
self.derivatives = []
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -351,14 +334,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
def _init_step_index(self, timestep):
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -368,19 +344,6 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -419,19 +382,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
"""Constructs an exponential noise schedule."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -455,24 +406,7 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -587,21 +521,6 @@ class LMSDiscreteScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.Tensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise schedule at the specified timesteps.
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor to add to the original samples.
|
||||
timesteps (`torch.Tensor`):
|
||||
The timesteps at which to add noise, determining the noise level from the schedule.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples with added noise scaled according to the timestep schedule.
|
||||
"""
|
||||
# Make sure sigmas and timesteps have the same device and dtype as original_samples
|
||||
sigmas = self.sigmas.to(device=original_samples.device, dtype=original_samples.dtype)
|
||||
if original_samples.device.type == "mps" and torch.is_floating_point(timesteps):
|
||||
|
||||
@@ -15,7 +15,7 @@
|
||||
# DISCLAIMER: This file is strongly influenced by https://github.com/ermongroup/ddim
|
||||
|
||||
import math
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -26,10 +26,10 @@ from .scheduling_utils import KarrasDiffusionSchedulers, SchedulerMixin, Schedul
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -37,17 +37,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -453,22 +452,6 @@ class PNDMScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, Optional, Tuple, Union
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -45,10 +45,10 @@ class RePaintSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -56,17 +56,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
|
||||
|
||||
import math
|
||||
from typing import Callable, List, Literal, Optional, Tuple, Union
|
||||
from typing import Callable, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -33,10 +33,10 @@ if is_scipy_available():
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -44,17 +44,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -254,7 +253,7 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -343,8 +342,6 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -352,14 +349,6 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -386,19 +375,6 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -434,20 +410,7 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -473,19 +436,7 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
"""Constructs an exponential noise schedule."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -509,24 +460,7 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -1259,22 +1193,6 @@ class SASolverScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
|
||||
@@ -109,7 +109,7 @@ class SCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -173,14 +173,7 @@ class SCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
self._begin_index = None
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
def _init_step_index(self, timestep):
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -189,23 +182,7 @@ class SCMScheduler(SchedulerMixin, ConfigMixin):
|
||||
self._step_index = self._begin_index
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
|
||||
@@ -17,7 +17,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -50,10 +50,10 @@ class TCDSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -61,17 +61,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -99,13 +98,13 @@ def rescale_zero_terminal_snr(betas: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
The betas that the scheduler is being initialized with.
|
||||
the betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -253,23 +252,7 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
self._begin_index = None
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler.index_for_timestep
|
||||
def index_for_timestep(
|
||||
self, timestep: Union[float, torch.Tensor], schedule_timesteps: Optional[torch.Tensor] = None
|
||||
) -> int:
|
||||
"""
|
||||
Find the index of a given timestep in the timestep schedule.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The timestep value to find in the schedule.
|
||||
schedule_timesteps (`torch.Tensor`, *optional*):
|
||||
The timestep schedule to search in. If `None`, uses `self.timesteps`.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The index of the timestep in the schedule. For the very first step, returns the second index if
|
||||
multiple matches exist to avoid skipping a sigma when starting mid-schedule (e.g., for image-to-image).
|
||||
"""
|
||||
def index_for_timestep(self, timestep, schedule_timesteps=None):
|
||||
if schedule_timesteps is None:
|
||||
schedule_timesteps = self.timesteps
|
||||
|
||||
@@ -284,14 +267,7 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
return indices[pos].item()
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._init_step_index
|
||||
def _init_step_index(self, timestep: Union[float, torch.Tensor]) -> None:
|
||||
"""
|
||||
Initialize the step index for the scheduler based on the given timestep.
|
||||
|
||||
Args:
|
||||
timestep (`float` or `torch.Tensor`):
|
||||
The current timestep to initialize the step index from.
|
||||
"""
|
||||
def _init_step_index(self, timestep):
|
||||
if self.begin_index is None:
|
||||
if isinstance(timestep, torch.Tensor):
|
||||
timestep = timestep.to(self.timesteps.device)
|
||||
@@ -316,7 +292,7 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -340,24 +316,6 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddim.DDIMScheduler._get_variance
|
||||
def _get_variance(self, timestep, prev_timestep):
|
||||
"""
|
||||
Computes the variance of the noise added at a given diffusion step.
|
||||
|
||||
For a given `timestep` and its previous step, this method calculates the variance as defined in DDIM/DDPM
|
||||
literature:
|
||||
var_t = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev)
|
||||
where alpha_prod and beta_prod are cumulative products of alphas and betas, respectively.
|
||||
|
||||
Args:
|
||||
timestep (`int`):
|
||||
The current timestep in the diffusion process.
|
||||
prev_timestep (`int`):
|
||||
The previous timestep in the diffusion process. If negative, uses `final_alpha_cumprod`.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The variance for the current timestep.
|
||||
"""
|
||||
alpha_prod_t = self.alphas_cumprod[timestep]
|
||||
alpha_prod_t_prev = self.alphas_cumprod[prev_timestep] if prev_timestep >= 0 else self.final_alpha_cumprod
|
||||
beta_prod_t = 1 - alpha_prod_t
|
||||
@@ -370,8 +328,6 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -379,14 +335,6 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -686,22 +634,6 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
@@ -724,21 +656,6 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.get_velocity
|
||||
def get_velocity(self, sample: torch.Tensor, noise: torch.Tensor, timesteps: torch.IntTensor) -> torch.Tensor:
|
||||
"""
|
||||
Compute the velocity prediction from the sample and noise according to the velocity formula.
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The input sample.
|
||||
noise (`torch.Tensor`):
|
||||
The noise tensor.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps for velocity computation.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The computed velocity.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as sample
|
||||
self.alphas_cumprod = self.alphas_cumprod.to(device=sample.device)
|
||||
alphas_cumprod = self.alphas_cumprod.to(dtype=sample.dtype)
|
||||
@@ -762,17 +679,6 @@ class TCDScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler.previous_timestep
|
||||
def previous_timestep(self, timestep):
|
||||
"""
|
||||
Compute the previous timestep in the diffusion chain.
|
||||
|
||||
Args:
|
||||
timestep (`int`):
|
||||
The current timestep.
|
||||
|
||||
Returns:
|
||||
`int`:
|
||||
The previous timestep.
|
||||
"""
|
||||
if self.custom_timesteps or self.num_inference_steps:
|
||||
index = (self.timesteps == timestep).nonzero(as_tuple=True)[0][0]
|
||||
if index == self.timesteps.shape[0] - 1:
|
||||
|
||||
@@ -14,7 +14,7 @@
|
||||
|
||||
import math
|
||||
from dataclasses import dataclass
|
||||
from typing import Literal, Optional, Tuple, Union
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -46,10 +46,10 @@ class UnCLIPSchedulerOutput(BaseOutput):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -57,17 +57,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -335,22 +334,6 @@ class UnCLIPScheduler(SchedulerMixin, ConfigMixin):
|
||||
noise: torch.Tensor,
|
||||
timesteps: torch.IntTensor,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Add noise to the original samples according to the noise magnitude at each timestep (this is the forward
|
||||
diffusion process).
|
||||
|
||||
Args:
|
||||
original_samples (`torch.Tensor`):
|
||||
The original samples to which noise will be added.
|
||||
noise (`torch.Tensor`):
|
||||
The noise to add to the samples.
|
||||
timesteps (`torch.IntTensor`):
|
||||
The timesteps indicating the noise level for each sample.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The noisy samples.
|
||||
"""
|
||||
# Make sure alphas_cumprod and timestep have same device and dtype as original_samples
|
||||
# Move the self.alphas_cumprod to device to avoid redundant CPU to GPU data movement
|
||||
# for the subsequent add_noise calls
|
||||
|
||||
@@ -16,7 +16,7 @@
|
||||
# The codebase is modified based on https://github.com/huggingface/diffusers/blob/main/src/diffusers/schedulers/scheduling_dpmsolver_multistep.py
|
||||
|
||||
import math
|
||||
from typing import List, Literal, Optional, Tuple, Union
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
@@ -32,10 +32,10 @@ if is_scipy_available():
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.betas_for_alpha_bar
|
||||
def betas_for_alpha_bar(
|
||||
num_diffusion_timesteps: int,
|
||||
max_beta: float = 0.999,
|
||||
alpha_transform_type: Literal["cosine", "exp"] = "cosine",
|
||||
) -> torch.Tensor:
|
||||
num_diffusion_timesteps,
|
||||
max_beta=0.999,
|
||||
alpha_transform_type="cosine",
|
||||
):
|
||||
"""
|
||||
Create a beta schedule that discretizes the given alpha_t_bar function, which defines the cumulative product of
|
||||
(1-beta) over time from t = [0,1].
|
||||
@@ -43,17 +43,16 @@ def betas_for_alpha_bar(
|
||||
Contains a function alpha_bar that takes an argument t and transforms it to the cumulative product of (1-beta) up
|
||||
to that part of the diffusion process.
|
||||
|
||||
|
||||
Args:
|
||||
num_diffusion_timesteps (`int`):
|
||||
The number of betas to produce.
|
||||
max_beta (`float`, defaults to `0.999`):
|
||||
The maximum beta to use; use values lower than 1 to avoid numerical instability.
|
||||
alpha_transform_type (`"cosine"` or `"exp"`, defaults to `"cosine"`):
|
||||
The type of noise schedule for `alpha_bar`. Choose from `cosine` or `exp`.
|
||||
num_diffusion_timesteps (`int`): the number of betas to produce.
|
||||
max_beta (`float`): the maximum beta to use; use values lower than 1 to
|
||||
prevent singularities.
|
||||
alpha_transform_type (`str`, *optional*, default to `cosine`): the type of noise schedule for alpha_bar.
|
||||
Choose from `cosine` or `exp`
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The betas used by the scheduler to step the model outputs.
|
||||
betas (`np.ndarray`): the betas used by the scheduler to step the model outputs
|
||||
"""
|
||||
if alpha_transform_type == "cosine":
|
||||
|
||||
@@ -81,13 +80,13 @@ def rescale_zero_terminal_snr(betas):
|
||||
"""
|
||||
Rescales betas to have zero terminal SNR Based on https://huggingface.co/papers/2305.08891 (Algorithm 1)
|
||||
|
||||
|
||||
Args:
|
||||
betas (`torch.Tensor`):
|
||||
The betas that the scheduler is being initialized with.
|
||||
the betas that the scheduler is being initialized with.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
Rescaled betas with zero terminal SNR.
|
||||
`torch.Tensor`: rescaled betas with zero terminal SNR
|
||||
"""
|
||||
# Convert betas to alphas_bar_sqrt
|
||||
alphas = 1.0 - betas
|
||||
@@ -298,7 +297,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
Sets the begin index for the scheduler. This function should be run from pipeline before the inference.
|
||||
|
||||
Args:
|
||||
begin_index (`int`, defaults to `0`):
|
||||
begin_index (`int`):
|
||||
The begin index for the scheduler.
|
||||
"""
|
||||
self._begin_index = begin_index
|
||||
@@ -433,8 +432,6 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
# Copied from diffusers.schedulers.scheduling_ddpm.DDPMScheduler._threshold_sample
|
||||
def _threshold_sample(self, sample: torch.Tensor) -> torch.Tensor:
|
||||
"""
|
||||
Apply dynamic thresholding to the predicted sample.
|
||||
|
||||
"Dynamic thresholding: At each sampling step we set s to a certain percentile absolute pixel value in xt0 (the
|
||||
prediction of x_0 at timestep t), and if s > 1, then we threshold xt0 to the range [-s, s] and then divide by
|
||||
s. Dynamic thresholding pushes saturated pixels (those near -1 and 1) inwards, thereby actively preventing
|
||||
@@ -442,14 +439,6 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
photorealism as well as better image-text alignment, especially when using very large guidance weights."
|
||||
|
||||
https://huggingface.co/papers/2205.11487
|
||||
|
||||
Args:
|
||||
sample (`torch.Tensor`):
|
||||
The predicted sample to be thresholded.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The thresholded sample.
|
||||
"""
|
||||
dtype = sample.dtype
|
||||
batch_size, channels, *remaining_dims = sample.shape
|
||||
@@ -476,19 +465,6 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._sigma_to_t
|
||||
def _sigma_to_t(self, sigma, log_sigmas):
|
||||
"""
|
||||
Convert sigma values to corresponding timestep values through interpolation.
|
||||
|
||||
Args:
|
||||
sigma (`np.ndarray`):
|
||||
The sigma value(s) to convert to timestep(s).
|
||||
log_sigmas (`np.ndarray`):
|
||||
The logarithm of the sigma schedule used for interpolation.
|
||||
|
||||
Returns:
|
||||
`np.ndarray`:
|
||||
The interpolated timestep value(s) corresponding to the input sigma(s).
|
||||
"""
|
||||
# get log sigma
|
||||
log_sigma = np.log(np.maximum(sigma, 1e-10))
|
||||
|
||||
@@ -524,20 +500,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_karras
|
||||
def _convert_to_karras(self, in_sigmas: torch.Tensor, num_inference_steps) -> torch.Tensor:
|
||||
"""
|
||||
Construct the noise schedule as proposed in [Elucidating the Design Space of Diffusion-Based Generative
|
||||
Models](https://huggingface.co/papers/2206.00364).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following the Karras noise schedule.
|
||||
"""
|
||||
"""Constructs the noise schedule of Karras et al. (2022)."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -563,19 +526,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
|
||||
# Copied from diffusers.schedulers.scheduling_euler_discrete.EulerDiscreteScheduler._convert_to_exponential
|
||||
def _convert_to_exponential(self, in_sigmas: torch.Tensor, num_inference_steps: int) -> torch.Tensor:
|
||||
"""
|
||||
Construct an exponential noise schedule.
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following an exponential schedule.
|
||||
"""
|
||||
"""Constructs an exponential noise schedule."""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
@@ -599,24 +550,7 @@ class UniPCMultistepScheduler(SchedulerMixin, ConfigMixin):
|
||||
def _convert_to_beta(
|
||||
self, in_sigmas: torch.Tensor, num_inference_steps: int, alpha: float = 0.6, beta: float = 0.6
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Construct a beta noise schedule as proposed in [Beta Sampling is All You
|
||||
Need](https://huggingface.co/papers/2407.12173).
|
||||
|
||||
Args:
|
||||
in_sigmas (`torch.Tensor`):
|
||||
The input sigma values to be converted.
|
||||
num_inference_steps (`int`):
|
||||
The number of inference steps to generate the noise schedule for.
|
||||
alpha (`float`, *optional*, defaults to `0.6`):
|
||||
The alpha parameter for the beta distribution.
|
||||
beta (`float`, *optional*, defaults to `0.6`):
|
||||
The beta parameter for the beta distribution.
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`:
|
||||
The converted sigma values following a beta distribution schedule.
|
||||
"""
|
||||
"""From "Beta Sampling is All You Need" [arXiv:2407.12173] (Lee et. al, 2024)"""
|
||||
|
||||
# Hack to make sure that other schedulers which copy this function don't break
|
||||
# TODO: Add this logic to the other schedulers
|
||||
|
||||
@@ -108,7 +108,6 @@ from .import_utils import (
|
||||
is_tensorboard_available,
|
||||
is_timm_available,
|
||||
is_torch_available,
|
||||
is_torch_mlu_available,
|
||||
is_torch_npu_available,
|
||||
is_torch_version,
|
||||
is_torch_xla_available,
|
||||
|
||||
@@ -42,7 +42,7 @@ HF_MODULES_CACHE = os.getenv("HF_MODULES_CACHE", os.path.join(HF_HOME, "modules"
|
||||
DEPRECATED_REVISION_ARGS = ["fp16", "non-ema"]
|
||||
DIFFUSERS_REQUEST_TIMEOUT = 60
|
||||
DIFFUSERS_ATTN_BACKEND = os.getenv("DIFFUSERS_ATTN_BACKEND", "native")
|
||||
DIFFUSERS_ATTN_CHECKS = os.getenv("DIFFUSERS_ATTN_CHECKS", "0").upper() in ENV_VARS_TRUE_VALUES
|
||||
DIFFUSERS_ATTN_CHECKS = os.getenv("DIFFUSERS_ATTN_CHECKS", "0") in ENV_VARS_TRUE_VALUES
|
||||
DEFAULT_HF_PARALLEL_LOADING_WORKERS = 8
|
||||
HF_ENABLE_PARALLEL_LOADING = os.environ.get("HF_ENABLE_PARALLEL_LOADING", "").upper() in ENV_VARS_TRUE_VALUES
|
||||
DIFFUSERS_DISABLE_REMOTE_CODE = os.getenv("DIFFUSERS_DISABLE_REMOTE_CODE", "false").upper() in ENV_VARS_TRUE_VALUES
|
||||
|
||||
@@ -1623,21 +1623,6 @@ class VQModel(metaclass=DummyObject):
|
||||
requires_backends(cls, ["torch"])
|
||||
|
||||
|
||||
class WanAnimateTransformer3DModel(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch"])
|
||||
|
||||
|
||||
class WanTransformer3DModel(metaclass=DummyObject):
|
||||
_backends = ["torch"]
|
||||
|
||||
|
||||
@@ -3512,21 +3512,6 @@ class VQDiffusionPipeline(metaclass=DummyObject):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class WanAnimatePipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
requires_backends(self, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_config(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, *args, **kwargs):
|
||||
requires_backends(cls, ["torch", "transformers"])
|
||||
|
||||
|
||||
class WanImageToVideoPipeline(metaclass=DummyObject):
|
||||
_backends = ["torch", "transformers"]
|
||||
|
||||
|
||||
@@ -192,7 +192,6 @@ except importlib_metadata.PackageNotFoundError:
|
||||
|
||||
_torch_xla_available, _torch_xla_version = _is_package_available("torch_xla")
|
||||
_torch_npu_available, _torch_npu_version = _is_package_available("torch_npu")
|
||||
_torch_mlu_available, _torch_mlu_version = _is_package_available("torch_mlu")
|
||||
_transformers_available, _transformers_version = _is_package_available("transformers")
|
||||
_hf_hub_available, _hf_hub_version = _is_package_available("huggingface_hub")
|
||||
_kernels_available, _kernels_version = _is_package_available("kernels")
|
||||
@@ -244,10 +243,6 @@ def is_torch_npu_available():
|
||||
return _torch_npu_available
|
||||
|
||||
|
||||
def is_torch_mlu_available():
|
||||
return _torch_mlu_available
|
||||
|
||||
|
||||
def is_flax_available():
|
||||
return _flax_available
|
||||
|
||||
|
||||
@@ -20,7 +20,7 @@ import os
|
||||
from typing import Callable, Dict, List, Optional, Tuple, Union
|
||||
|
||||
from . import logging
|
||||
from .import_utils import is_torch_available, is_torch_mlu_available, is_torch_npu_available, is_torch_version
|
||||
from .import_utils import is_torch_available, is_torch_npu_available, is_torch_version
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
@@ -242,8 +242,8 @@ def fourier_filter(x_in: "torch.Tensor", threshold: int, scale: int) -> "torch.T
|
||||
def apply_freeu(
|
||||
resolution_idx: int, hidden_states: "torch.Tensor", res_hidden_states: "torch.Tensor", **freeu_kwargs
|
||||
) -> Tuple["torch.Tensor", "torch.Tensor"]:
|
||||
"""Applies the FreeU mechanism as introduced in https://huggingface.co/papers/2309.11497. Adapted from the official
|
||||
code repository: https://github.com/ChenyangSi/FreeU.
|
||||
"""Applies the FreeU mechanism as introduced in https:
|
||||
//arxiv.org/abs/2309.11497. Adapted from the official code repository: https://github.com/ChenyangSi/FreeU.
|
||||
|
||||
Args:
|
||||
resolution_idx (`int`): Integer denoting the UNet block where FreeU is being applied.
|
||||
@@ -286,8 +286,6 @@ def get_device():
|
||||
return "xpu"
|
||||
elif torch.backends.mps.is_available():
|
||||
return "mps"
|
||||
elif is_torch_mlu_available():
|
||||
return "mlu"
|
||||
else:
|
||||
return "cpu"
|
||||
|
||||
|
||||
@@ -32,6 +32,20 @@ warnings.simplefilter(action="ignore", category=FutureWarning)
|
||||
|
||||
def pytest_configure(config):
|
||||
config.addinivalue_line("markers", "big_accelerator: marks tests as requiring big accelerator resources")
|
||||
config.addinivalue_line("markers", "lora: marks tests for LoRA/PEFT functionality")
|
||||
config.addinivalue_line("markers", "ip_adapter: marks tests for IP Adapter functionality")
|
||||
config.addinivalue_line("markers", "training: marks tests for training functionality")
|
||||
config.addinivalue_line("markers", "attention: marks tests for attention processor functionality")
|
||||
config.addinivalue_line("markers", "memory: marks tests for memory optimization functionality")
|
||||
config.addinivalue_line("markers", "cpu_offload: marks tests for CPU offloading functionality")
|
||||
config.addinivalue_line("markers", "group_offload: marks tests for group offloading functionality")
|
||||
config.addinivalue_line("markers", "compile: marks tests for torch.compile functionality")
|
||||
config.addinivalue_line("markers", "single_file: marks tests for single file checkpoint loading")
|
||||
config.addinivalue_line("markers", "bitsandbytes: marks tests for BitsAndBytes quantization functionality")
|
||||
config.addinivalue_line("markers", "quanto: marks tests for Quanto quantization functionality")
|
||||
config.addinivalue_line("markers", "torchao: marks tests for TorchAO quantization functionality")
|
||||
config.addinivalue_line("markers", "gguf: marks tests for GGUF quantization functionality")
|
||||
config.addinivalue_line("markers", "modelopt: marks tests for NVIDIA ModelOpt quantization functionality")
|
||||
|
||||
|
||||
def pytest_addoption(parser):
|
||||
|
||||
@@ -82,7 +82,3 @@ class AutoencoderDCTests(ModelTesterMixin, AutoencoderTesterMixin, unittest.Test
|
||||
@unittest.skipIf(IS_GITHUB_ACTIONS, reason="Skipping test inside GitHub Actions environment")
|
||||
def test_layerwise_casting_inference(self):
|
||||
super().test_layerwise_casting_inference()
|
||||
|
||||
@unittest.skipIf(IS_GITHUB_ACTIONS, reason="Skipping test inside GitHub Actions environment")
|
||||
def test_layerwise_casting_memory(self):
|
||||
super().test_layerwise_casting_memory()
|
||||
|
||||
@@ -317,9 +317,9 @@ class ModelUtilsTest(unittest.TestCase):
|
||||
repo_id, subfolder="transformer", cache_dir=tmpdir, local_files_only=True
|
||||
)
|
||||
|
||||
assert all(torch.equal(p1, p2) for p1, p2 in zip(model.parameters(), local_model.parameters())), (
|
||||
"Model parameters don't match!"
|
||||
)
|
||||
assert all(
|
||||
torch.equal(p1, p2) for p1, p2 in zip(model.parameters(), local_model.parameters())
|
||||
), "Model parameters don't match!"
|
||||
|
||||
# Remove a shard file
|
||||
cached_shard_file = try_to_load_from_cache(
|
||||
@@ -335,9 +335,9 @@ class ModelUtilsTest(unittest.TestCase):
|
||||
|
||||
# Verify error mentions the missing shard
|
||||
error_msg = str(context.exception)
|
||||
assert cached_shard_file in error_msg or "required according to the checkpoint index" in error_msg, (
|
||||
f"Expected error about missing shard, got: {error_msg}"
|
||||
)
|
||||
assert (
|
||||
cached_shard_file in error_msg or "required according to the checkpoint index" in error_msg
|
||||
), f"Expected error about missing shard, got: {error_msg}"
|
||||
|
||||
@unittest.skip("Flaky behaviour on CI. Re-enable after migrating to new runners")
|
||||
@unittest.skipIf(torch_device == "mps", reason="Test not supported for MPS.")
|
||||
@@ -354,9 +354,9 @@ class ModelUtilsTest(unittest.TestCase):
|
||||
)
|
||||
|
||||
download_requests = [r.method for r in m.request_history]
|
||||
assert download_requests.count("HEAD") == 3, (
|
||||
"3 HEAD requests one for config, one for model, and one for shard index file."
|
||||
)
|
||||
assert (
|
||||
download_requests.count("HEAD") == 3
|
||||
), "3 HEAD requests one for config, one for model, and one for shard index file."
|
||||
assert download_requests.count("GET") == 2, "2 GET requests one for config, one for model"
|
||||
|
||||
with requests_mock.mock(real_http=True) as m:
|
||||
@@ -368,9 +368,9 @@ class ModelUtilsTest(unittest.TestCase):
|
||||
)
|
||||
|
||||
cache_requests = [r.method for r in m.request_history]
|
||||
assert "HEAD" == cache_requests[0] and len(cache_requests) == 2, (
|
||||
"We should call only `model_info` to check for commit hash and knowing if shard index is present."
|
||||
)
|
||||
assert (
|
||||
"HEAD" == cache_requests[0] and len(cache_requests) == 2
|
||||
), "We should call only `model_info` to check for commit hash and knowing if shard index is present."
|
||||
|
||||
def test_weight_overwrite(self):
|
||||
with tempfile.TemporaryDirectory() as tmpdirname, self.assertRaises(ValueError) as error_context:
|
||||
|
||||
@@ -0,0 +1,37 @@
|
||||
from .attention import AttentionTesterMixin
|
||||
from .common import ModelTesterMixin
|
||||
from .compile import TorchCompileTesterMixin
|
||||
from .ip_adapter import IPAdapterTesterMixin
|
||||
from .lora import LoraTesterMixin
|
||||
from .memory import CPUOffloadTesterMixin, GroupOffloadTesterMixin, LayerwiseCastingTesterMixin, MemoryTesterMixin
|
||||
from .quantization import (
|
||||
BitsAndBytesTesterMixin,
|
||||
GGUFTesterMixin,
|
||||
ModelOptTesterMixin,
|
||||
QuantizationTesterMixin,
|
||||
QuantoTesterMixin,
|
||||
TorchAoTesterMixin,
|
||||
)
|
||||
from .single_file import SingleFileTesterMixin
|
||||
from .training import TrainingTesterMixin
|
||||
|
||||
|
||||
__all__ = [
|
||||
"AttentionTesterMixin",
|
||||
"BitsAndBytesTesterMixin",
|
||||
"CPUOffloadTesterMixin",
|
||||
"GGUFTesterMixin",
|
||||
"GroupOffloadTesterMixin",
|
||||
"IPAdapterTesterMixin",
|
||||
"LayerwiseCastingTesterMixin",
|
||||
"LoraTesterMixin",
|
||||
"MemoryTesterMixin",
|
||||
"ModelOptTesterMixin",
|
||||
"ModelTesterMixin",
|
||||
"QuantizationTesterMixin",
|
||||
"QuantoTesterMixin",
|
||||
"SingleFileTesterMixin",
|
||||
"TorchAoTesterMixin",
|
||||
"TorchCompileTesterMixin",
|
||||
"TrainingTesterMixin",
|
||||
]
|
||||
@@ -0,0 +1,180 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from diffusers.models.attention import AttentionModuleMixin
|
||||
from diffusers.models.attention_processor import (
|
||||
AttnProcessor,
|
||||
)
|
||||
|
||||
from ...testing_utils import is_attention, require_accelerator, torch_device
|
||||
|
||||
|
||||
@is_attention
|
||||
@require_accelerator
|
||||
class AttentionTesterMixin:
|
||||
"""
|
||||
Mixin class for testing attention processor and module functionality on models.
|
||||
|
||||
Tests functionality from AttentionModuleMixin including:
|
||||
- Attention processor management (set/get)
|
||||
- QKV projection fusion/unfusion
|
||||
- Attention backends (XFormers, NPU, etc.)
|
||||
|
||||
Expected class attributes to be set by subclasses:
|
||||
- model_class: The model class to test
|
||||
- base_precision: Tolerance for floating point comparisons (default: 1e-3)
|
||||
- uses_custom_attn_processor: Whether model uses custom attention processors (default: False)
|
||||
|
||||
Expected methods to be implemented by subclasses:
|
||||
- get_init_dict(): Returns dict of arguments to initialize the model
|
||||
- get_dummy_inputs(): Returns dict of inputs to pass to the model forward pass
|
||||
|
||||
Pytest mark: attention
|
||||
Use `pytest -m "not attention"` to skip these tests
|
||||
"""
|
||||
|
||||
base_precision = 1e-3
|
||||
|
||||
def test_fuse_unfuse_qkv_projections(self):
|
||||
init_dict = self.get_init_dict()
|
||||
inputs_dict = self.get_dummy_inputs()
|
||||
model = self.model_class(**init_dict)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
if not hasattr(model, "fuse_qkv_projections"):
|
||||
pytest.skip("Model does not support QKV projection fusion.")
|
||||
|
||||
# Get output before fusion
|
||||
with torch.no_grad():
|
||||
output_before_fusion = model(**inputs_dict)
|
||||
if isinstance(output_before_fusion, dict):
|
||||
output_before_fusion = output_before_fusion.to_tuple()[0]
|
||||
|
||||
# Fuse projections
|
||||
model.fuse_qkv_projections()
|
||||
|
||||
# Verify fusion occurred by checking for fused attributes
|
||||
has_fused_projections = False
|
||||
for module in model.modules():
|
||||
if isinstance(module, AttentionModuleMixin):
|
||||
if hasattr(module, "to_qkv") or hasattr(module, "to_kv"):
|
||||
has_fused_projections = True
|
||||
assert module.fused_projections, "fused_projections flag should be True"
|
||||
break
|
||||
|
||||
if has_fused_projections:
|
||||
# Get output after fusion
|
||||
with torch.no_grad():
|
||||
output_after_fusion = model(**inputs_dict)
|
||||
if isinstance(output_after_fusion, dict):
|
||||
output_after_fusion = output_after_fusion.to_tuple()[0]
|
||||
|
||||
# Verify outputs match
|
||||
assert torch.allclose(
|
||||
output_before_fusion, output_after_fusion, atol=self.base_precision
|
||||
), "Output should not change after fusing projections"
|
||||
|
||||
# Unfuse projections
|
||||
model.unfuse_qkv_projections()
|
||||
|
||||
# Verify unfusion occurred
|
||||
for module in model.modules():
|
||||
if isinstance(module, AttentionModuleMixin):
|
||||
assert not hasattr(module, "to_qkv"), "to_qkv should be removed after unfusing"
|
||||
assert not hasattr(module, "to_kv"), "to_kv should be removed after unfusing"
|
||||
assert not module.fused_projections, "fused_projections flag should be False"
|
||||
|
||||
# Get output after unfusion
|
||||
with torch.no_grad():
|
||||
output_after_unfusion = model(**inputs_dict)
|
||||
if isinstance(output_after_unfusion, dict):
|
||||
output_after_unfusion = output_after_unfusion.to_tuple()[0]
|
||||
|
||||
# Verify outputs still match
|
||||
assert torch.allclose(
|
||||
output_before_fusion, output_after_unfusion, atol=self.base_precision
|
||||
), "Output should match original after unfusing projections"
|
||||
|
||||
def test_get_set_processor(self):
|
||||
init_dict = self.get_init_dict()
|
||||
model = self.model_class(**init_dict)
|
||||
model.to(torch_device)
|
||||
|
||||
# Check if model has attention processors
|
||||
if not hasattr(model, "attn_processors"):
|
||||
pytest.skip("Model does not have attention processors.")
|
||||
|
||||
# Test getting processors
|
||||
processors = model.attn_processors
|
||||
assert isinstance(processors, dict), "attn_processors should return a dict"
|
||||
assert len(processors) > 0, "Model should have at least one attention processor"
|
||||
|
||||
# Test that all processors can be retrieved via get_processor
|
||||
for module in model.modules():
|
||||
if isinstance(module, AttentionModuleMixin):
|
||||
processor = module.get_processor()
|
||||
assert processor is not None, "get_processor should return a processor"
|
||||
|
||||
# Test setting a new processor
|
||||
new_processor = AttnProcessor()
|
||||
module.set_processor(new_processor)
|
||||
retrieved_processor = module.get_processor()
|
||||
assert retrieved_processor is new_processor, "Retrieved processor should be the same as the one set"
|
||||
|
||||
def test_attention_processor_dict(self):
|
||||
init_dict = self.get_init_dict()
|
||||
model = self.model_class(**init_dict)
|
||||
model.to(torch_device)
|
||||
|
||||
if not hasattr(model, "set_attn_processor"):
|
||||
pytest.skip("Model does not support setting attention processors.")
|
||||
|
||||
# Get current processors
|
||||
current_processors = model.attn_processors
|
||||
|
||||
# Create a dict of new processors
|
||||
new_processors = {key: AttnProcessor() for key in current_processors.keys()}
|
||||
|
||||
# Set processors using dict
|
||||
model.set_attn_processor(new_processors)
|
||||
|
||||
# Verify all processors were set
|
||||
updated_processors = model.attn_processors
|
||||
for key in current_processors.keys():
|
||||
assert type(updated_processors[key]) == AttnProcessor, f"Processor {key} should be AttnProcessor"
|
||||
|
||||
def test_attention_processor_count_mismatch_raises_error(self):
|
||||
init_dict = self.get_init_dict()
|
||||
model = self.model_class(**init_dict)
|
||||
model.to(torch_device)
|
||||
|
||||
if not hasattr(model, "set_attn_processor"):
|
||||
pytest.skip("Model does not support setting attention processors.")
|
||||
|
||||
# Get current processors
|
||||
current_processors = model.attn_processors
|
||||
|
||||
# Create a dict with wrong number of processors
|
||||
wrong_processors = {list(current_processors.keys())[0]: AttnProcessor()}
|
||||
|
||||
# Verify error is raised
|
||||
with pytest.raises(ValueError) as exc_info:
|
||||
model.set_attn_processor(wrong_processors)
|
||||
|
||||
assert "number of processors" in str(exc_info.value).lower(), "Error should mention processor count mismatch"
|
||||
@@ -0,0 +1,514 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
import os
|
||||
import tempfile
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from accelerate.utils.modeling import _get_proper_dtype, compute_module_sizes, dtype_byte_size
|
||||
|
||||
from diffusers.utils import SAFE_WEIGHTS_INDEX_NAME, _add_variant
|
||||
from diffusers.utils.testing_utils import require_accelerator, require_torch_multi_accelerator
|
||||
|
||||
from ...testing_utils import torch_device
|
||||
|
||||
|
||||
def compute_module_persistent_sizes(
|
||||
model: nn.Module,
|
||||
dtype: Optional[Union[str, torch.device]] = None,
|
||||
special_dtypes: Optional[Dict[str, Union[str, torch.device]]] = None,
|
||||
):
|
||||
"""
|
||||
Compute the size of each submodule of a given model (parameters + persistent buffers).
|
||||
"""
|
||||
if dtype is not None:
|
||||
dtype = _get_proper_dtype(dtype)
|
||||
dtype_size = dtype_byte_size(dtype)
|
||||
if special_dtypes is not None:
|
||||
special_dtypes = {key: _get_proper_dtype(dtyp) for key, dtyp in special_dtypes.items()}
|
||||
special_dtypes_size = {key: dtype_byte_size(dtyp) for key, dtyp in special_dtypes.items()}
|
||||
module_sizes = defaultdict(int)
|
||||
|
||||
module_list = []
|
||||
|
||||
module_list = named_persistent_module_tensors(model, recurse=True)
|
||||
|
||||
for name, tensor in module_list:
|
||||
if special_dtypes is not None and name in special_dtypes:
|
||||
size = tensor.numel() * special_dtypes_size[name]
|
||||
elif dtype is None:
|
||||
size = tensor.numel() * dtype_byte_size(tensor.dtype)
|
||||
elif str(tensor.dtype).startswith(("torch.uint", "torch.int", "torch.bool")):
|
||||
# According to the code in set_module_tensor_to_device, these types won't be converted
|
||||
# so use their original size here
|
||||
size = tensor.numel() * dtype_byte_size(tensor.dtype)
|
||||
else:
|
||||
size = tensor.numel() * min(dtype_size, dtype_byte_size(tensor.dtype))
|
||||
name_parts = name.split(".")
|
||||
for idx in range(len(name_parts) + 1):
|
||||
module_sizes[".".join(name_parts[:idx])] += size
|
||||
|
||||
return module_sizes
|
||||
|
||||
|
||||
def calculate_expected_num_shards(index_map_path):
|
||||
"""
|
||||
Calculate expected number of shards from index file.
|
||||
|
||||
Args:
|
||||
index_map_path: Path to the sharded checkpoint index file
|
||||
|
||||
Returns:
|
||||
int: Expected number of shards
|
||||
"""
|
||||
with open(index_map_path) as f:
|
||||
weight_map_dict = json.load(f)["weight_map"]
|
||||
first_key = list(weight_map_dict.keys())[0]
|
||||
weight_loc = weight_map_dict[first_key] # e.g., diffusion_pytorch_model-00001-of-00002.safetensors
|
||||
expected_num_shards = int(weight_loc.split("-")[-1].split(".")[0])
|
||||
return expected_num_shards
|
||||
|
||||
|
||||
def check_device_map_is_respected(model, device_map):
|
||||
for param_name, param in model.named_parameters():
|
||||
# Find device in device_map
|
||||
while len(param_name) > 0 and param_name not in device_map:
|
||||
param_name = ".".join(param_name.split(".")[:-1])
|
||||
if param_name not in device_map:
|
||||
raise ValueError("device map is incomplete, it does not contain any device for `param_name`.")
|
||||
|
||||
param_device = device_map[param_name]
|
||||
if param_device in ["cpu", "disk"]:
|
||||
assert param.device == torch.device("meta"), f"Expected device 'meta' for {param_name}, got {param.device}"
|
||||
else:
|
||||
assert param.device == torch.device(
|
||||
param_device
|
||||
), f"Expected device {param_device} for {param_name}, got {param.device}"
|
||||
|
||||
|
||||
class ModelTesterMixin:
|
||||
"""
|
||||
Base mixin class for model testing with common test methods.
|
||||
|
||||
Expected class attributes to be set by subclasses:
|
||||
- model_class: The model class to test
|
||||
- main_input_name: Name of the main input tensor (e.g., "sample", "hidden_states")
|
||||
- base_precision: Default tolerance for floating point comparisons (default: 1e-3)
|
||||
|
||||
Expected methods to be implemented by subclasses:
|
||||
- get_init_dict(): Returns dict of arguments to initialize the model
|
||||
- get_dummy_inputs(): Returns dict of inputs to pass to the model forward pass
|
||||
"""
|
||||
|
||||
model_class = None
|
||||
base_precision = 1e-3
|
||||
model_split_percents = [0.5, 0.7]
|
||||
|
||||
def get_init_dict(self):
|
||||
raise NotImplementedError("get_init_dict must be implemented by subclasses. ")
|
||||
|
||||
def get_dummy_inputs(self):
|
||||
raise NotImplementedError(
|
||||
"get_dummy_inputs must be implemented by subclasses. " "It should return inputs_dict."
|
||||
)
|
||||
|
||||
def test_from_save_pretrained(self, expected_max_diff=5e-5):
|
||||
model = self.model_class(**self.get_init_dict())
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
new_model = self.model_class.from_pretrained(tmpdirname)
|
||||
new_model.to(torch_device)
|
||||
|
||||
# check if all parameters shape are the same
|
||||
for param_name in model.state_dict().keys():
|
||||
param_1 = model.state_dict()[param_name]
|
||||
param_2 = new_model.state_dict()[param_name]
|
||||
assert (
|
||||
param_1.shape == param_2.shape
|
||||
), f"Parameter shape mismatch for {param_name}. Original: {param_1.shape}, loaded: {param_2.shape}"
|
||||
|
||||
with torch.no_grad():
|
||||
image = model(**self.get_dummy_inputs())
|
||||
|
||||
if isinstance(image, dict):
|
||||
image = image.to_tuple()[0]
|
||||
|
||||
new_image = new_model(**self.get_dummy_inputs())
|
||||
|
||||
if isinstance(new_image, dict):
|
||||
new_image = new_image.to_tuple()[0]
|
||||
|
||||
max_diff = (image - new_image).abs().max().item()
|
||||
assert (
|
||||
max_diff <= expected_max_diff
|
||||
), f"Models give different forward passes. Max diff: {max_diff}, expected: {expected_max_diff}"
|
||||
|
||||
def test_from_save_pretrained_variant(self, expected_max_diff=5e-5):
|
||||
model = self.model_class(**self.get_init_dict())
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname, variant="fp16")
|
||||
new_model = self.model_class.from_pretrained(tmpdirname, variant="fp16")
|
||||
|
||||
# non-variant cannot be loaded
|
||||
with pytest.raises(OSError) as exc_info:
|
||||
self.model_class.from_pretrained(tmpdirname)
|
||||
|
||||
# make sure that error message states what keys are missing
|
||||
assert "Error no file named diffusion_pytorch_model.bin found in directory" in str(exc_info.value)
|
||||
|
||||
new_model.to(torch_device)
|
||||
|
||||
with torch.no_grad():
|
||||
image = model(**self.get_dummy_inputs())
|
||||
if isinstance(image, dict):
|
||||
image = image.to_tuple()[0]
|
||||
|
||||
new_image = new_model(**self.get_dummy_inputs())
|
||||
|
||||
if isinstance(new_image, dict):
|
||||
new_image = new_image.to_tuple()[0]
|
||||
|
||||
max_diff = (image - new_image).abs().max().item()
|
||||
assert (
|
||||
max_diff <= expected_max_diff
|
||||
), f"Models give different forward passes. Max diff: {max_diff}, expected: {expected_max_diff}"
|
||||
|
||||
def test_from_save_pretrained_dtype(self):
    """
    Check that `from_pretrained(..., torch_dtype=dtype)` yields a model of that dtype.

    For every tested dtype the model is cast, saved and reloaded with `low_cpu_mem_usage=True`. When the model
    class defines `_keep_in_fp32_modules` and it is `None`, the reload is repeated with `low_cpu_mem_usage=False`.
    bfloat16 is not tested when `torch_device` is "mps".
    """
    candidate_dtypes = (torch.float32, torch.float16, torch.bfloat16)
    dtypes_to_check = [d for d in candidate_dtypes if not (torch_device == "mps" and d == torch.bfloat16)]

    model = self.model_class(**self.get_init_dict())
    model.to(torch_device)
    model.eval()

    for dtype in dtypes_to_check:
        with tempfile.TemporaryDirectory() as save_dir:
            model.to(dtype)
            model.save_pretrained(save_dir)

            reloaded = self.model_class.from_pretrained(save_dir, low_cpu_mem_usage=True, torch_dtype=dtype)
            assert reloaded.dtype == dtype

            # `()` as fallback keeps "attribute missing" distinct from "attribute explicitly set to None".
            fp32_modules_unset = getattr(self.model_class, "_keep_in_fp32_modules", ()) is None
            if fp32_modules_unset:
                # The second loading path is only required to honour `torch_dtype` when no module is pinned
                # to float32.
                reloaded = self.model_class.from_pretrained(
                    save_dir, low_cpu_mem_usage=False, torch_dtype=dtype
                )
                assert reloaded.dtype == dtype
|
||||
|
||||
def test_determinism(self, expected_max_diff=1e-5):
    """
    Run the same model twice on freshly built dummy inputs and require (near) identical outputs.

    Positions where either run produced NaN are excluded from the comparison.

    Args:
        expected_max_diff: Largest tolerated absolute difference between the two runs.
    """

    def forward_once(net):
        # Dict outputs exposing `to_tuple()` are reduced to their first element.
        result = net(**self.get_dummy_inputs())
        return result.to_tuple()[0] if isinstance(result, dict) else result

    net = self.model_class(**self.get_init_dict())
    net.to(torch_device)
    net.eval()

    with torch.no_grad():
        run_a = forward_once(net)
        run_b = forward_once(net)

    flat_a, flat_b = run_a.flatten(), run_b.flatten()

    # Keep only the positions where neither run is NaN.
    comparable = ~torch.isnan(flat_a) & ~torch.isnan(flat_b)

    max_diff = (flat_a[comparable] - flat_b[comparable]).abs().max().item()
    assert (
        max_diff <= expected_max_diff
    ), f"Model outputs are not deterministic. Max diff: {max_diff}, expected: {expected_max_diff}"
|
||||
|
||||
def test_output(self, expected_output_shape=None):
    """
    Run a forward pass and validate the (first) model output.

    Args:
        expected_output_shape: Shape the output tensor must have (tuple, list or `torch.Size`). When `None`
            (the default) only the existence of an output is checked; previously the default made the shape
            assertion fail unconditionally, because a tensor shape never compares equal to `None`.
    """
    model = self.model_class(**self.get_init_dict())
    model.to(torch_device)
    model.eval()

    inputs_dict = self.get_dummy_inputs()
    with torch.no_grad():
        output = model(**inputs_dict)

    # Dict outputs exposing `to_tuple()` are reduced to their first element.
    if isinstance(output, dict):
        output = output.to_tuple()[0]

    assert output is not None, "Model output is None"

    if expected_output_shape is not None:
        # Compare as tuples so that a list is accepted as well as a tuple / torch.Size.
        assert tuple(output.shape) == tuple(
            expected_output_shape
        ), f"Output shape does not match expected. Expected {expected_output_shape}, got {output.shape}"
|
||||
|
||||
def test_outputs_equivalence(self):
    """
    Check that the dict output and the `return_dict=False` tuple output of the model hold the same values.

    Both outputs are walked in parallel; tensors are compared with `torch.allclose` (atol=1e-5) after NaNs
    have been replaced by zero, `None` entries in the tuple output are ignored.
    """

    def set_nan_tensor_to_zero(t):
        # Temporary fallback until `aten::_index_put_impl_` is implemented in mps
        # Track progress in https://github.com/pytorch/pytorch/issues/77764
        device = t.device
        # Work on a copy (the CPU transfer on mps, a clone elsewhere): zeroing the model output in place would
        # make the NaN report in the failure message below always read `False`.
        t = t.to("cpu") if device.type == "mps" else t.clone()
        t[t != t] = 0
        return t.to(device)

    def recursive_check(tuple_object, dict_object):
        if isinstance(tuple_object, (list, tuple)):
            for tuple_iterable_value, dict_iterable_value in zip(tuple_object, dict_object.values()):
                recursive_check(tuple_iterable_value, dict_iterable_value)
        elif isinstance(tuple_object, dict):
            for tuple_iterable_value, dict_iterable_value in zip(tuple_object.values(), dict_object.values()):
                recursive_check(tuple_iterable_value, dict_iterable_value)
        elif tuple_object is None:
            return
        else:
            assert torch.allclose(
                set_nan_tensor_to_zero(tuple_object), set_nan_tensor_to_zero(dict_object), atol=1e-5
            ), (
                "Tuple and dict output are not equal. Difference:"
                f" {torch.max(torch.abs(tuple_object - dict_object))}. Tuple has `nan`:"
                f" {torch.isnan(tuple_object).any()} and `inf`: {torch.isinf(tuple_object).any()}. Dict has"
                f" `nan`: {torch.isnan(dict_object).any()} and `inf`: {torch.isinf(dict_object).any()}."
            )

    model = self.model_class(**self.get_init_dict())
    model.to(torch_device)
    model.eval()

    with torch.no_grad():
        outputs_dict = model(**self.get_dummy_inputs())
        outputs_tuple = model(**self.get_dummy_inputs(), return_dict=False)

    recursive_check(outputs_tuple, outputs_dict)
|
||||
|
||||
def test_model_config_to_json_string(self):
    """Check that `model.config.to_json_string()` returns a non-empty string."""
    serialized = self.model_class(**self.get_init_dict()).config.to_json_string()

    assert isinstance(serialized, str), "Config to_json_string should return a string"
    assert len(serialized) > 0, "JSON string should not be empty"
|
||||
|
||||
@require_accelerator
@pytest.mark.skipif(
    torch_device not in ["cuda", "xpu"],
    reason="float16/bfloat16 save/load round trip is only tested on cuda and xpu",
)
def test_from_save_pretrained_float16_bfloat16(self):
    """
    Save the model in bfloat16 / float16 and check dtypes and outputs of the reloaded model.

    Parameters whose dotted name contains an entry of `_keep_in_fp32_modules` must be float32 after loading,
    every other parameter must have the requested dtype. The outputs of the original and the reloaded model
    are compared with `torch.allclose` (atol=1e-4).
    """

    def first_output(result):
        # Same normalisation as in the sibling tests: dict outputs are reduced to their first element.
        return result.to_tuple()[0] if isinstance(result, dict) else result

    model = self.model_class(**self.get_init_dict())
    model.to(torch_device)
    # `_keep_in_fp32_modules` may be None; treat that as "no module is pinned to float32".
    fp32_modules = model._keep_in_fp32_modules or []

    with tempfile.TemporaryDirectory() as tmp_dir:
        for torch_dtype in [torch.bfloat16, torch.float16]:
            model.to(torch_dtype).save_pretrained(tmp_dir)
            model_loaded = self.model_class.from_pretrained(tmp_dir, torch_dtype=torch_dtype).to(torch_device)

            for name, param in model_loaded.named_parameters():
                if any(module_to_keep_in_fp32 in name.split(".") for module_to_keep_in_fp32 in fp32_modules):
                    assert param.data.dtype == torch.float32
                else:
                    assert param.data.dtype == torch_dtype

            with torch.no_grad():
                # `get_dummy_inputs` is a method of the tester; the bare-name call raised NameError.
                output = first_output(model(**self.get_dummy_inputs()))
                output_loaded = first_output(model_loaded(**self.get_dummy_inputs()))

            assert torch.allclose(
                output, output_loaded, atol=1e-4
            ), f"Loaded model output differs for {torch_dtype}"
|
||||
|
||||
@require_accelerator
def test_sharded_checkpoints(self):
    """
    Save the model as a sharded checkpoint and verify that reloading it reproduces the original output.

    The shard size is 75% of the model's persistent size (expressed in KB). The test checks that the
    safetensors index file exists, that the number of `.safetensors` files equals the number derived from the
    index, and that the reloaded model matches the original output (atol=1e-5).
    """
    torch.manual_seed(0)
    init_kwargs = self.get_init_dict()
    reference_inputs = self.get_dummy_inputs()
    reference_model = self.model_class(**init_kwargs).eval()
    reference_model = reference_model.to(torch_device)

    base_output = reference_model(**reference_inputs)

    total_bytes = compute_module_persistent_sizes(reference_model)[""]
    # Convert to KB as these test models are small
    shard_limit_kb = int(total_bytes * 0.75 / 2**10)

    with tempfile.TemporaryDirectory() as tmp_dir:
        reference_model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{shard_limit_kb}KB")

        index_path = os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)
        assert os.path.exists(index_path), "Index file should exist"

        # The number of shard files on disk has to agree with the index.
        expected_num_shards = calculate_expected_num_shards(index_path)
        actual_num_shards = sum(1 for entry in os.listdir(tmp_dir) if entry.endswith(".safetensors"))
        assert (
            actual_num_shards == expected_num_shards
        ), f"Expected {expected_num_shards} shards, got {actual_num_shards}"

        reloaded_model = self.model_class.from_pretrained(tmp_dir).eval()
        reloaded_model = reloaded_model.to(torch_device)

        # Re-seed before building the inputs, exactly as was done for the reference run.
        torch.manual_seed(0)
        reloaded_output = reloaded_model(**self.get_dummy_inputs())

        assert torch.allclose(
            base_output[0], reloaded_output[0], atol=1e-5
        ), "Output should match after sharded save/load"
|
||||
|
||||
@require_accelerator
def test_sharded_checkpoints_with_variant(self):
    """
    Same as the sharded checkpoint test, but the checkpoint is written and read with `variant="fp16"`.

    Checks that the variant index file exists, that the number of `.safetensors` files equals the number
    derived from that index, and that the reloaded model matches the original output (atol=1e-5).
    """
    variant = "fp16"

    torch.manual_seed(0)
    init_kwargs = self.get_init_dict()
    reference_inputs = self.get_dummy_inputs()
    reference_model = self.model_class(**init_kwargs).eval()
    reference_model = reference_model.to(torch_device)

    base_output = reference_model(**reference_inputs)

    total_bytes = compute_module_persistent_sizes(reference_model)[""]
    # Convert to KB as these test models are small
    shard_limit_kb = int(total_bytes * 0.75 / 2**10)

    with tempfile.TemporaryDirectory() as tmp_dir:
        reference_model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{shard_limit_kb}KB", variant=variant)

        index_filename = _add_variant(SAFE_WEIGHTS_INDEX_NAME, variant)
        index_path = os.path.join(tmp_dir, index_filename)
        assert os.path.exists(index_path), f"Variant index file {index_filename} should exist"

        # The number of shard files on disk has to agree with the variant index.
        expected_num_shards = calculate_expected_num_shards(index_path)
        actual_num_shards = sum(1 for entry in os.listdir(tmp_dir) if entry.endswith(".safetensors"))
        assert (
            actual_num_shards == expected_num_shards
        ), f"Expected {expected_num_shards} shards, got {actual_num_shards}"

        reloaded_model = self.model_class.from_pretrained(tmp_dir, variant=variant).eval()
        reloaded_model = reloaded_model.to(torch_device)

        # Re-seed before building the inputs, exactly as was done for the reference run.
        torch.manual_seed(0)
        reloaded_output = reloaded_model(**self.get_dummy_inputs())

        assert torch.allclose(
            base_output[0], reloaded_output[0], atol=1e-5
        ), "Output should match after variant sharded save/load"
|
||||
|
||||
@require_accelerator
def test_sharded_checkpoints_with_parallel_loading(self):
    """
    Load a sharded checkpoint with parallel loading disabled and enabled and compare against the original.

    The test toggles `HF_ENABLE_PARALLEL_LOADING` and `DEFAULT_HF_PARALLEL_LOADING_WORKERS` on
    `diffusers.utils.constants` and restores both afterwards. No timing is asserted: for the small test
    models the parallel path can legitimately be slower, so only successful loading and matching outputs are
    required.
    """
    from diffusers.utils import constants

    torch.manual_seed(0)
    config = self.get_init_dict()
    inputs_dict = self.get_dummy_inputs()
    model = self.model_class(**config).eval()
    model = model.to(torch_device)

    base_output = model(**inputs_dict)

    model_size = compute_module_persistent_sizes(model)[""]
    max_shard_size = int((model_size * 0.75) / (2**10))  # Convert to KB as these test models are small

    # Remember exactly the settings modified below so they can be restored. The sentinel distinguishes
    # "attribute did not exist" from any real value.
    missing = object()
    original_parallel_loading = constants.HF_ENABLE_PARALLEL_LOADING
    original_parallel_workers = getattr(constants, "DEFAULT_HF_PARALLEL_LOADING_WORKERS", missing)

    try:
        with tempfile.TemporaryDirectory() as tmp_dir:
            model.cpu().save_pretrained(tmp_dir, max_shard_size=f"{max_shard_size}KB")
            index_path = os.path.join(tmp_dir, SAFE_WEIGHTS_INDEX_NAME)
            assert os.path.exists(index_path), "Index file should exist"

            # Check if the right number of shards exists
            expected_num_shards = calculate_expected_num_shards(index_path)
            actual_num_shards = len([file for file in os.listdir(tmp_dir) if file.endswith(".safetensors")])
            assert (
                actual_num_shards == expected_num_shards
            ), f"Expected {expected_num_shards} shards, got {actual_num_shards}"

            # Load without parallel loading
            constants.HF_ENABLE_PARALLEL_LOADING = False
            model_sequential = self.model_class.from_pretrained(tmp_dir).eval()
            model_sequential = model_sequential.to(torch_device)

            torch.manual_seed(0)
            inputs_dict_sequential = self.get_dummy_inputs()
            output_sequential = model_sequential(**inputs_dict_sequential)

            assert torch.allclose(
                base_output[0], output_sequential[0], atol=1e-5
            ), "Output should match with sequential loading"

            # Load with parallel loading
            constants.HF_ENABLE_PARALLEL_LOADING = True
            constants.DEFAULT_HF_PARALLEL_LOADING_WORKERS = 2

            model_parallel = self.model_class.from_pretrained(tmp_dir).eval()
            model_parallel = model_parallel.to(torch_device)

            torch.manual_seed(0)
            inputs_dict_parallel = self.get_dummy_inputs()
            output_parallel = model_parallel(**inputs_dict_parallel)

            assert torch.allclose(
                base_output[0], output_parallel[0], atol=1e-5
            ), "Output should match with parallel loading"
    finally:
        # Restore original values
        constants.HF_ENABLE_PARALLEL_LOADING = original_parallel_loading
        if original_parallel_workers is missing:
            # The attribute was introduced by this test; remove it again if it was set.
            if hasattr(constants, "DEFAULT_HF_PARALLEL_LOADING_WORKERS"):
                del constants.DEFAULT_HF_PARALLEL_LOADING_WORKERS
        else:
            constants.DEFAULT_HF_PARALLEL_LOADING_WORKERS = original_parallel_workers
|
||||
|
||||
@require_torch_multi_accelerator
def test_model_parallelism(self):
    """
    Reload the model with `device_map="auto"` under several memory budgets for device 0.

    For every entry of `self.model_split_percents` the budget of device 0 is that fraction of the model size,
    while device 1 and the CPU each get twice the model size. The reloaded model must be placed on exactly
    devices 0 and 1, must respect its device map and must reproduce the original output (atol=1e-5).
    Skipped when the model class has `_no_split_modules` set to `None`.
    """
    if self.model_class._no_split_modules is None:
        pytest.skip("Test not supported for this model as `_no_split_modules` is not set.")

    init_kwargs = self.get_init_dict()
    inputs_dict = self.get_dummy_inputs()
    reference_model = self.model_class(**init_kwargs).eval()
    reference_model = reference_model.to(torch_device)

    torch.manual_seed(0)
    base_output = reference_model(**inputs_dict)

    model_size = compute_module_sizes(reference_model)[""]
    first_device_budgets = [int(fraction * model_size) for fraction in self.model_split_percents]
    roomy_budget = model_size * 2

    with tempfile.TemporaryDirectory() as tmp_dir:
        reference_model.cpu().save_pretrained(tmp_dir)

        for budget in first_device_budgets:
            memory_limits = {0: budget, 1: roomy_budget, "cpu": roomy_budget}
            dispatched = self.model_class.from_pretrained(tmp_dir, device_map="auto", max_memory=memory_limits)

            # Making sure part of the model will be on GPU 0 and GPU 1
            used_devices = set(dispatched.hf_device_map.values())
            assert used_devices == {0, 1}, "Model should be split across GPUs"

            check_device_map_is_respected(dispatched, dispatched.hf_device_map)

            torch.manual_seed(0)
            dispatched_output = dispatched(**inputs_dict)

            assert torch.allclose(
                base_output[0], dispatched_output[0], atol=1e-5
            ), "Output should match with model parallelism"
|
||||
@@ -0,0 +1,162 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import gc
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from ...testing_utils import (
|
||||
backend_empty_cache,
|
||||
is_torch_compile,
|
||||
require_accelerator,
|
||||
require_torch_version_greater,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
|
||||
@is_torch_compile
@require_accelerator
@require_torch_version_greater("2.7.1")
class TorchCompileTesterMixin:
    """
    Mixin class for testing torch.compile functionality on models.

    Expected class attributes to be set by subclasses:
        - model_class: The model class to test
        - different_shapes_for_compilation: Optional list of (height, width) tuples for dynamic shape testing

    Expected methods to be implemented by subclasses:
        - get_init_dict(): Returns dict of arguments to initialize the model
        - get_dummy_inputs(): Returns dict of inputs to pass to the model forward pass

    Pytest mark: compile
        Use `pytest -m "not compile"` to skip these tests
    """

    # (height, width) pairs used by `test_compile_on_different_shapes`; `None` skips that test.
    different_shapes_for_compilation = None

    def setup_method(self):
        """Reset the compiler state and free cached accelerator memory before each test."""
        torch.compiler.reset()
        gc.collect()
        backend_empty_cache(torch_device)

    def teardown_method(self):
        """Reset the compiler state and free cached accelerator memory after each test."""
        torch.compiler.reset()
        gc.collect()
        backend_empty_cache(torch_device)

    def test_torch_compile_recompilation_and_graph_break(self):
        """Compile the full model with `fullgraph=True` and run it twice with recompilation treated as an error."""
        init_dict = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()

        model = self.model_class(**init_dict).to(torch_device)
        model.eval()
        model = torch.compile(model, fullgraph=True)

        with (
            torch._inductor.utils.fresh_inductor_cache(),
            torch._dynamo.config.patch(error_on_recompile=True),
            torch.no_grad(),
        ):
            _ = model(**inputs_dict)
            _ = model(**inputs_dict)

    def test_torch_compile_repeated_blocks(self):
        """Compile only the repeated blocks of the model and run it twice under a small recompile limit."""
        if self.model_class._repeated_blocks is None:
            pytest.skip("Skipping test as the model class doesn't have `_repeated_blocks` set.")

        init_dict = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()

        model = self.model_class(**init_dict).to(torch_device)
        model.eval()
        model.compile_repeated_blocks(fullgraph=True)

        # `UNet2DConditionModel` is granted one additional recompilation.
        recompile_limit = 1
        if self.model_class.__name__ == "UNet2DConditionModel":
            recompile_limit = 2

        with (
            torch._inductor.utils.fresh_inductor_cache(),
            torch._dynamo.config.patch(recompile_limit=recompile_limit),
            torch.no_grad(),
        ):
            _ = model(**inputs_dict)
            _ = model(**inputs_dict)

    def test_compile_with_group_offloading(self):
        """Enable block-level group offloading, compile the model and run it twice."""
        if not self.model_class._supports_group_offloading:
            pytest.skip("Model does not support group offloading.")

        # The raised cache size limit is a process-wide setting; restore it afterwards so that it does not
        # leak into tests that run later in the same process.
        previous_cache_size_limit = torch._dynamo.config.cache_size_limit
        torch._dynamo.config.cache_size_limit = 10000
        try:
            init_dict = self.get_init_dict()
            inputs_dict = self.get_dummy_inputs()
            model = self.model_class(**init_dict)
            model.eval()

            group_offload_kwargs = {
                "onload_device": torch_device,
                "offload_device": "cpu",
                "offload_type": "block_level",
                "num_blocks_per_group": 1,
                "use_stream": True,
                "non_blocking": True,
            }
            model.enable_group_offload(**group_offload_kwargs)
            model.compile()

            with torch.no_grad():
                _ = model(**inputs_dict)
                _ = model(**inputs_dict)
        finally:
            torch._dynamo.config.cache_size_limit = previous_cache_size_limit

    def test_compile_on_different_shapes(self):
        """Compile with `dynamic=True` and run every configured (height, width) without recompilation."""
        if self.different_shapes_for_compilation is None:
            pytest.skip(f"Skipping as `different_shapes_for_compilation` is not set for {self.__class__.__name__}.")

        # `use_duck_shape` is a process-wide setting as well; restore the previous value when the test is done.
        previous_use_duck_shape = torch.fx.experimental._config.use_duck_shape
        torch.fx.experimental._config.use_duck_shape = False
        try:
            init_dict = self.get_init_dict()
            model = self.model_class(**init_dict).to(torch_device)
            model.eval()
            model = torch.compile(model, fullgraph=True, dynamic=True)

            for height, width in self.different_shapes_for_compilation:
                with torch._dynamo.config.patch(error_on_recompile=True), torch.no_grad():
                    inputs_dict = self.get_dummy_inputs(height=height, width=width)
                    _ = model(**inputs_dict)
        finally:
            torch.fx.experimental._config.use_duck_shape = previous_use_duck_shape

    def test_compile_works_with_aot(self):
        """Export the model, package it ahead of time, load the package and run it twice as `forward`."""
        from torch._inductor.package import load_package

        init_dict = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()

        model = self.model_class(**init_dict).to(torch_device)
        exported_model = torch.export.export(model, args=(), kwargs=inputs_dict)

        with tempfile.TemporaryDirectory() as tmpdir:
            package_path = os.path.join(tmpdir, f"{self.model_class.__name__}.pt2")
            _ = torch._inductor.aoti_compile_and_package(exported_model, package_path=package_path)
            assert os.path.exists(package_path), f"Package file not created at {package_path}"
            loaded_binary = load_package(package_path, run_single_threaded=True)

        # Route the forward pass of the eager model through the loaded package.
        model.forward = loaded_binary

        with torch.no_grad():
            _ = model(**inputs_dict)
            _ = model(**inputs_dict)
|
||||
@@ -0,0 +1,109 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import tempfile
|
||||
import uuid
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from huggingface_hub.utils import is_jinja_available
|
||||
|
||||
from ...others.test_utils import TOKEN, USER, is_staging_test
|
||||
|
||||
|
||||
@is_staging_test
class ModelPushToHubTesterMixin:
    """
    Mixin class for testing push_to_hub functionality on models.

    Expected class attributes to be set by subclasses:
        - model_class: The model class to test

    Expected methods to be implemented by subclasses:
        - get_init_dict(): Returns dict of arguments to initialize the model

    Every test deletes the repository it created in a `finally` block, because all tests of a class share the
    same repository ids and a leftover repository would affect the following test.
    """

    # Random suffix, generated once when the class body is executed.
    identifier = uuid.uuid4()
    # Repository name used for pushes to the namespace of `USER`.
    repo_id = f"test-model-{identifier}"
    # Repository id used for pushes to the `valid_org` organization.
    org_repo_id = f"valid_org/{repo_id}-org"

    @staticmethod
    def _assert_parameters_equal(model, new_model, message):
        """Assert that both models hold pairwise identical parameters."""
        for p1, p2 in zip(model.parameters(), new_model.parameters()):
            assert torch.equal(p1, p2), message

    def test_push_to_hub(self):
        """Test pushing model to hub and loading it back."""
        # Imported locally: `delete_repo` is not part of the module-level imports.
        from huggingface_hub import delete_repo

        init_dict = self.get_init_dict()
        model = self.model_class(**init_dict)

        model.push_to_hub(self.repo_id, token=TOKEN)
        try:
            new_model = self.model_class.from_pretrained(f"{USER}/{self.repo_id}")
            self._assert_parameters_equal(
                model, new_model, "Parameters don't match after push_to_hub and from_pretrained"
            )
        finally:
            # Reset repo
            delete_repo(token=TOKEN, repo_id=self.repo_id)

        # Push to hub via save_pretrained
        with tempfile.TemporaryDirectory() as tmp_dir:
            model.save_pretrained(tmp_dir, repo_id=self.repo_id, push_to_hub=True, token=TOKEN)
        try:
            new_model = self.model_class.from_pretrained(f"{USER}/{self.repo_id}")
            self._assert_parameters_equal(
                model,
                new_model,
                "Parameters don't match after save_pretrained with push_to_hub and from_pretrained",
            )
        finally:
            # Reset repo
            delete_repo(token=TOKEN, repo_id=self.repo_id)

    def test_push_to_hub_in_organization(self):
        """Test pushing model to hub in organization namespace."""
        # Imported locally: `delete_repo` is not part of the module-level imports.
        from huggingface_hub import delete_repo

        init_dict = self.get_init_dict()
        model = self.model_class(**init_dict)

        model.push_to_hub(self.org_repo_id, token=TOKEN)
        try:
            new_model = self.model_class.from_pretrained(self.org_repo_id)
            self._assert_parameters_equal(
                model, new_model, "Parameters don't match after push_to_hub to org and from_pretrained"
            )
        finally:
            # Reset repo
            delete_repo(token=TOKEN, repo_id=self.org_repo_id)

        # Push to hub via save_pretrained
        with tempfile.TemporaryDirectory() as tmp_dir:
            model.save_pretrained(tmp_dir, push_to_hub=True, token=TOKEN, repo_id=self.org_repo_id)
        try:
            new_model = self.model_class.from_pretrained(self.org_repo_id)
            self._assert_parameters_equal(
                model,
                new_model,
                "Parameters don't match after save_pretrained with push_to_hub to org and from_pretrained",
            )
        finally:
            # Reset repo
            delete_repo(token=TOKEN, repo_id=self.org_repo_id)

    def test_push_to_hub_library_name(self):
        """Test that library_name in model card is set to 'diffusers'."""
        if not is_jinja_available():
            pytest.skip("Model card tests cannot be performed without Jinja installed.")

        # Imported locally: neither name is part of the module-level imports.
        from huggingface_hub import ModelCard, delete_repo

        init_dict = self.get_init_dict()
        model = self.model_class(**init_dict)

        model.push_to_hub(self.repo_id, token=TOKEN)
        try:
            model_card = ModelCard.load(f"{USER}/{self.repo_id}", token=TOKEN).data
            assert (
                model_card.library_name == "diffusers"
            ), f"Expected library_name 'diffusers', got {model_card.library_name}"
        finally:
            # Reset repo
            delete_repo(token=TOKEN, repo_id=self.repo_id)
|
||||
@@ -0,0 +1,205 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2025 HuggingFace Inc.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import torch
|
||||
|
||||
from diffusers.models.attention_processor import IPAdapterAttnProcessor
|
||||
|
||||
from ...testing_utils import is_ip_adapter, torch_device
|
||||
|
||||
|
||||
def create_ip_adapter_state_dict(model):
    """
    Build a dummy IP Adapter state dict for `model`.

    One pair of `to_k_ip` / `to_v_ip` weights is created per entry of `model.attn_processors`, keyed by the
    odd numbers 1, 3, 5, ... No weights are created when `model.config` has no `cross_attention_dim` (or it is
    `None`).

    Args:
        model: The model to create IP adapter weights for

    Returns:
        dict: `{"ip_adapter": weights}` where `weights` maps `"<id>.to_k_ip.weight"` and
        `"<id>.to_v_ip.weight"` to tensors
    """
    adapter_weights = {}
    next_id = 1

    for _ in model.attn_processors.keys():
        cross_attention_dim = getattr(model.config, "cross_attention_dim", None)
        if cross_attention_dim is not None:
            # `hidden_size` falls back to the cross attention dimension when the config does not define it.
            hidden_size = getattr(model.config, "hidden_size", cross_attention_dim)

            # A freshly initialised processor provides weights of the right shapes.
            processor = IPAdapterAttnProcessor(
                hidden_size=hidden_size, cross_attention_dim=cross_attention_dim, scale=1.0
            )
            processor_weights = processor.state_dict()

            adapter_weights[f"{next_id}.to_k_ip.weight"] = processor_weights["to_k_ip.0.weight"]
            adapter_weights[f"{next_id}.to_v_ip.weight"] = processor_weights["to_v_ip.0.weight"]
            next_id += 2

    return {"ip_adapter": adapter_weights}
|
||||
|
||||
|
||||
def check_if_ip_adapter_correctly_set(model) -> bool:
    """
    Tell whether `model` currently uses an IP Adapter attention processor.

    Args:
        model: The model to check

    Returns:
        bool: True if at least one entry of `model.attn_processors` is an `IPAdapterAttnProcessor`,
        False otherwise
    """
    return any(isinstance(processor, IPAdapterAttnProcessor) for processor in model.attn_processors.values())
|
||||
|
||||
|
||||
@is_ip_adapter
class IPAdapterTesterMixin:
    """
    Mixin class for testing IP Adapter functionality on models.

    Expected class attributes to be set by subclasses:
        - model_class: The model class to test

    Expected methods to be implemented by subclasses:
        - get_init_dict(): Returns dict of arguments to initialize the model
        - get_dummy_inputs(): Returns dict of inputs to pass to the model forward pass
        - create_ip_adapter_state_dict(model): Returns the IP Adapter state dict to load into the model

    Pytest mark: ip_adapter
        Use `pytest -m "not ip_adapter"` to skip these tests
    """

    def create_ip_adapter_state_dict(self, model):
        """Return the IP Adapter state dict for `model`; has to be provided by the child class."""
        raise NotImplementedError("child class must implement method to create IPAdapter State Dict")

    def _inputs_with_image_embeds(self, model, inputs_dict):
        """Return a copy of `inputs_dict` extended by random `image_embeds` for the IP Adapter."""
        # Falls back to 32 when the config does not define `cross_attention_dim`.
        cross_attention_dim = getattr(model.config, "cross_attention_dim", 32)
        image_embeds = torch.randn(1, 1, cross_attention_dim).to(torch_device)
        inputs_dict_with_adapter = inputs_dict.copy()
        inputs_dict_with_adapter["image_embeds"] = image_embeds
        return inputs_dict_with_adapter

    def test_load_ip_adapter(self):
        """Loading IP Adapter weights has to install the processors and change the model output."""
        init_dict = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()
        model = self.model_class(**init_dict).to(torch_device)

        torch.manual_seed(0)
        output_no_adapter = model(**inputs_dict, return_dict=False)[0]

        # Create dummy IP adapter state dict
        ip_adapter_state_dict = self.create_ip_adapter_state_dict(model)

        # Load IP adapter
        model._load_ip_adapter_weights([ip_adapter_state_dict])
        assert check_if_ip_adapter_correctly_set(model), "IP Adapter processors not set correctly"

        torch.manual_seed(0)
        # Create dummy image embeds for IP adapter
        inputs_dict_with_adapter = self._inputs_with_image_embeds(model, inputs_dict)

        outputs_with_adapter = model(**inputs_dict_with_adapter, return_dict=False)[0]

        assert not torch.allclose(
            output_no_adapter, outputs_with_adapter, atol=1e-4, rtol=1e-4
        ), "Output should differ with IP Adapter enabled"

    def test_ip_adapter_scale(self):
        """Changing the IP Adapter scale from 0.0 to 1.0 has to change the model output."""
        init_dict = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()
        model = self.model_class(**init_dict).to(torch_device)

        # Create and load dummy IP adapter state dict
        ip_adapter_state_dict = self.create_ip_adapter_state_dict(model)
        model._load_ip_adapter_weights([ip_adapter_state_dict])

        # Both forward passes use the same inputs, so only the scale differs between them.
        torch.manual_seed(0)
        inputs_dict_with_adapter = self._inputs_with_image_embeds(model, inputs_dict)

        # Test scale = 0.0 (no effect)
        model.set_ip_adapter_scale(0.0)
        torch.manual_seed(0)
        output_scale_zero = model(**inputs_dict_with_adapter, return_dict=False)[0]

        # Test scale = 1.0 (full effect)
        model.set_ip_adapter_scale(1.0)
        torch.manual_seed(0)
        output_scale_one = model(**inputs_dict_with_adapter, return_dict=False)[0]

        # Outputs should differ with different scales
        assert not torch.allclose(
            output_scale_zero, output_scale_one, atol=1e-4, rtol=1e-4
        ), "Output should differ with different IP Adapter scales"

    def test_unload_ip_adapter(self):
        """Unloading the IP Adapter has to restore the attention processor types that were set before."""
        init_dict = self.get_init_dict()
        model = self.model_class(**init_dict).to(torch_device)

        # Save original processors
        original_processors = {k: type(v).__name__ for k, v in model.attn_processors.items()}

        # Create and load IP adapter
        ip_adapter_state_dict = self.create_ip_adapter_state_dict(model)
        model._load_ip_adapter_weights([ip_adapter_state_dict])
        assert check_if_ip_adapter_correctly_set(model), "IP Adapter should be set"

        # Unload IP adapter
        model.unload_ip_adapter()
        assert not check_if_ip_adapter_correctly_set(model), "IP Adapter should be unloaded"

        # Verify processors are restored
        current_processors = {k: type(v).__name__ for k, v in model.attn_processors.items()}
        assert original_processors == current_processors, "Processors should be restored after unload"

    def test_ip_adapter_save_load(self):
        """IP Adapter weights written to a safetensors file and loaded again have to give the same output."""
        import safetensors.torch

        init_dict = self.get_init_dict()
        inputs_dict = self.get_dummy_inputs()
        model = self.model_class(**init_dict).to(torch_device)

        # Create and load IP adapter
        ip_adapter_state_dict = self.create_ip_adapter_state_dict(model)
        model._load_ip_adapter_weights([ip_adapter_state_dict])

        # The same inputs (including the image embeds) are used before and after the round trip; otherwise
        # the two outputs would not be comparable.
        torch.manual_seed(0)
        inputs_dict_with_adapter = self._inputs_with_image_embeds(model, inputs_dict)

        torch.manual_seed(0)
        output_before_save = model(**inputs_dict_with_adapter, return_dict=False)[0]

        with tempfile.TemporaryDirectory() as tmpdir:
            # Save the IP adapter weights
            # NOTE(review): only the "ip_adapter" entry is written and reloaded -- confirm that state dicts
            # returned by child classes need no further entries for `_load_ip_adapter_weights`.
            save_path = os.path.join(tmpdir, "ip_adapter.safetensors")
            safetensors.torch.save_file(ip_adapter_state_dict["ip_adapter"], save_path)

            # Unload and reload
            model.unload_ip_adapter()
            assert not check_if_ip_adapter_correctly_set(model), "IP Adapter should be unloaded"

            # Reload from saved file
            loaded_state_dict = {"ip_adapter": safetensors.torch.load_file(save_path)}
            model._load_ip_adapter_weights([loaded_state_dict])
            assert check_if_ip_adapter_correctly_set(model), "IP Adapter should be loaded"

            torch.manual_seed(0)
            output_after_load = model(**inputs_dict_with_adapter, return_dict=False)[0]

        # Outputs should match before and after save/load
        assert torch.allclose(
            output_before_save, output_after_load, atol=1e-4, rtol=1e-4
        ), "Output should match before and after save/load"
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user